# Demo Wordle RL

Demo of training a DQN agent with a Hindsight Experience Replay (HER) buffer in a Wordle environment.

In [5]:
from wordle_rl import WordleEnv
import numpy as np
import gym
# import torch
from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy
from stable_baselines3 import HerReplayBuffer, DQN

In [2]:
# Build the Wordle environment from its JSON configuration file
# (path is relative to the notebook's working directory).
ENV_CONFIG_PATH = "config.json"
env = WordleEnv(ENV_CONFIG_PATH)

In [3]:
# Settings forwarded verbatim to HerReplayBuffer via `replay_buffer_kwargs`.
her_buffer_kwargs = {
    "n_sampled_goal": 100,
    "goal_selection_strategy": "future",
    "online_sampling": True,
    "max_episode_length": 6,
}

# DQN agent on the Wordle env, storing transitions in a HER replay buffer.
# verbose=1 makes stable-baselines3 print setup and training info.
model = DQN(
    policy="MultiInputPolicy",
    env=env,
    replay_buffer_class=HerReplayBuffer,
    replay_buffer_kwargs=her_buffer_kwargs,
    verbose=1,
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [4]:
# Disabled alternative: a plain DQN model without the HER replay buffer.
# model = DQN("MlpPolicy", env, verbose=1)

In [4]:
# Short demo training run; may take a few seconds depending on your CPU.
TRAIN_TIMESTEPS = 1000
model.learn(total_timesteps=TRAIN_TIMESTEPS)

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 6        |
|    ep_rew_mean      | 203      |
|    exploration_rate | 0.772    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 1349     |
|    time_elapsed     | 0        |
|    total_timesteps  | 24       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 6        |
|    ep_rew_mean      | 210      |
|    exploration_rate | 0.544    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 1278     |
|    time_elapsed     | 0        |
|    total_timesteps  | 48       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 6        |
|    ep_rew_mean      | 196      |
|    exploration_rate | 0.316    |
| time/               |          |
|    episodes       

<stable_baselines3.dqn.dqn.DQN at 0x7f80534274f0>

In [6]:
# Roll out the trained agent for one episode, rendering after every step.
MAX_ROLLOUT_STEPS = 100  # upper bound on steps; the loop stops at the first `done`

obs = env.reset()
for step in range(MAX_ROLLOUT_STEPS):
    # Ask the policy for a non-deterministic action for the current observation.
    action, _state = model.predict(obs, deterministic=False)
    obs, reward, done, info = env.step(action)
    env.render()
    if not done:
        continue
    # Episode finished: leave the environment freshly reset, then stop.
    obs = env.reset()
    break

Guessed word: benni
Total reward: 1
Guessed word: jewed
Total reward: 1
Guessed word: filks
Total reward: 51
Guessed word: decad
Total reward: 51
Guessed word: filks
Total reward: -4949
Guessed word: nitre
Total reward: -4898
{'observation': array([[[ 2.,  5., 14., 14.,  9.],
        [10.,  5., 23.,  5.,  4.],
        [ 6.,  9., 12., 11., 19.],
        [ 4.,  5.,  3.,  1.,  4.],
        [ 6.,  9., 12., 11., 19.],
        [14.,  9., 20., 18.,  5.]],

       [[27., 27., 27., 27., 28.],
        [27., 27., 27., 27., 27.],
        [27., 29., 27., 27., 27.],
        [27., 27., 27., 27., 27.],
        [27., 29., 27., 27., 27.],
        [27., 29., 27., 28., 27.]]]), 'achieved_goal': array([[[ 2.,  5., 14., 14.,  9.],
        [10.,  5., 23.,  5.,  4.],
        [ 6.,  9., 12., 11., 19.],
        [ 4.,  5.,  3.,  1.,  4.],
        [ 6.,  9., 12., 11., 19.],
        [14.,  9., 20., 18.,  5.]],

       [[27., 27., 27., 27., 28.],
        [27., 27., 27., 27., 27.],
        [27., 29., 27., 27., 27.],

In [21]:
# Persist the trained agent to disk under this path so it can be reloaded later.
MODEL_SAVE_PATH = "wordle_rl_model_dqn"
model.save(MODEL_SAVE_PATH)