In [1]:
import numpy as np
import gym
import gym_ttt
from warnings import filterwarnings
filterwarnings('ignore')


def callback(lcl, _glb):
    """
    Training callback that reports whether the task counts as solved.

    :param lcl: (dict) local variables of the training loop; reads ``'t'``
        and, once ``t`` exceeds 100, ``'episode_rewards'``
    :param _glb: (dict) global variables of the training loop (unused)
    :return: (bool) truthy when more than 100 steps have elapsed and the sum
        of the 100 episode rewards preceding the most recent one, divided
        by 100, is at least 199
    """
    past_warmup = lcl['t'] > 100
    if not past_warmup:
        # Mirror short-circuit `and`: hand back the failed comparison itself
        # without touching 'episode_rewards'.
        return past_warmup

    # Slice skips the latest entry and takes the (up to) 100 entries before it.
    window = lcl['episode_rewards'][-101:-1]
    return sum(window) / 100 >= 199

# Create the environment registered under the id 'ttt-v1'.
# NOTE(review): presumably the id is registered as a side effect of
# `import gym_ttt` above — confirm against that package.
env = gym.make('ttt-v1')

In [2]:
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

In [3]:
def evaluate(model, num_episodes=100, deterministic=False):
    """
    Evaluate a RL agent by rolling it out in its own environment.

    :param model: (BaseRLModel object) the RL Agent; must provide
        ``get_env()`` and ``predict(obs, deterministic=...)``
    :param num_episodes: (int) number of episodes to evaluate it
    :param deterministic: (bool) forwarded to ``model.predict``; the default
        ``False`` keeps the original stochastic action sampling
    :return: (float) Mean episode return over ``num_episodes`` episodes
        (NaN when ``num_episodes`` is 0)
    :raises ValueError: if the model's environment runs more than one
        sub-environment in parallel
    """
    env = model.get_env()

    # The episode loop below tracks a single `done` flag, so it is only
    # meaningful for one environment. Fail early with a clear message instead
    # of the ambiguous-truth-value error a multi-env `done` array would raise.
    num_envs = getattr(env, "num_envs", 1)
    if num_envs != 1:
        raise ValueError(
            "evaluate() only supports a single environment, got num_envs=%d" % num_envs
        )

    all_episode_rewards = []
    for _ in range(num_episodes):
        episode_return = 0.0
        done = False
        obs = env.reset()
        while not done:
            # _states are only useful when using LSTM policies
            action, _states = model.predict(obs, deterministic=deterministic)
            # here, reward and done are arrays (length 1)
            # because we are using a vectorized env
            obs, reward, done, _info = env.step(action)
            # Collapse the length-1 arrays to plain scalars so the episode
            # return is an ordinary float.
            episode_return += float(np.sum(reward))
            done = bool(np.all(done))

        all_episode_rewards.append(episode_return)

    # Avoid np.mean([]) (NaN plus a RuntimeWarning) when no episode was run.
    if all_episode_rewards:
        mean_episode_reward = float(np.mean(all_episode_rewards))
    else:
        mean_episode_reward = float("nan")
    print("Mean reward:", mean_episode_reward, "Num episodes:", num_episodes)

    return mean_episode_reward

In [4]:
# Build a PPO agent using the 'MlpPolicy' network on `env`, with verbose
# logging enabled and a learning rate of 1e-4.
model = PPO('MlpPolicy', env, verbose=1, learning_rate=0.0001)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [5]:
# Baseline: evaluate the freshly constructed (not yet trained) agent over
# 100 episodes, before any call to model.learn().
mean_reward_before_train = evaluate(model, num_episodes=100)

   0 1 2 
  --------
0 |O X O |
1 |O X X |
2 |X X O |
  --------
   0 1 2 
  --------
0 |X X O |
1 |X O O |
2 |X - X |
  --------
   0 1 2 
  --------
0 |X - X |
1 |- O X |
2 |O O X |
  --------
   0 1 2 
  --------
0 |O - O |
1 |- O X |
2 |X X X |
  --------
   0 1 2 
  --------
0 |O X O |
1 |X O O |
2 |X O X |
  --------
   0 1 2 
  --------
0 |X O O |
1 |X O X |
2 |X X O |
  --------
   0 1 2 
  --------
0 |O - X |
1 |X X X |
2 |O - - |
  --------
   0 1 2 
  --------
0 |X O X |
1 |O X O |
2 |X X O |
  --------
   0 1 2 
  --------
0 |X X O |
1 |X - - |
2 |X O - |
  --------
   0 1 2 
  --------
0 |O O X |
1 |X O X |
2 |O X X |
  --------
   0 1 2 
  --------
0 |O X O |
1 |O X X |
2 |X X O |
  --------
   0 1 2 
  --------
0 |- - O |
1 |O O O |
2 |X - X |
  --------
   0 1 2 
  --------
0 |O O X |
1 |X X O |
2 |X O O |
  --------
   0 1 2 
  --------
0 |X O X |
1 |X X O |
2 |O O X |
  --------
   0 1 2 
  --------
0 |X X X |
1 |O O X |
2 |- O O |
  --------
   0 1 2 
  --------
0 |O

In [6]:
# Train the agent for 10,000 environment timesteps. As the last expression in
# the cell, the call's return value (the PPO object) is echoed as cell output.
model.learn(total_timesteps=10000)

   0 1 2 
  --------
0 |O O X |
1 |X O O |
2 |X X X |
  --------
   0 1 2 
  --------
0 |X X X |
1 |O - O |
2 |O X X |
  --------
   0 1 2 
  --------
0 |O X O |
1 |O X - |
2 |X X O |
  --------
   0 1 2 
  --------
0 |- O - |
1 |X O X |
2 |X O - |
  --------
   0 1 2 
  --------
0 |X X X |
1 |- - - |
2 |O - O |
  --------
   0 1 2 
  --------
0 |O X X |
1 |O X X |
2 |X O O |
  --------
   0 1 2 
  --------
0 |X X O |
1 |O O X |
2 |X X O |
  --------
   0 1 2 
  --------
0 |O X O |
1 |X X X |
2 |O - - |
  --------
   0 1 2 
  --------
0 |X - - |
1 |X - - |
2 |X - O |
  --------
   0 1 2 
  --------
0 |X X O |
1 |O - O |
2 |X X O |
  --------
   0 1 2 
  --------
0 |X X O |
1 |O X X |
2 |X O O |
  --------
   0 1 2 
  --------
0 |O O O |
1 |X - - |
2 |- - - |
  --------
   0 1 2 
  --------
0 |X X - |
1 |O O O |
2 |X - X |
  --------
   0 1 2 
  --------
0 |- X - |
1 |O O O |
2 |X - - |
  --------
   0 1 2 
  --------
0 |O X O |
1 |X X O |
2 |X X X |
  --------
   0 1 2 
  --------
0 |X

   0 1 2 
  --------
0 |- O - |
1 |X X X |
2 |X - O |
  --------
   0 1 2 
  --------
0 |O X X |
1 |O O O |
2 |O X - |
  --------
   0 1 2 
  --------
0 |X O O |
1 |O - O |
2 |X X X |
  --------
   0 1 2 
  --------
0 |X O - |
1 |X - X |
2 |X O - |
  --------
   0 1 2 
  --------
0 |O X X |
1 |X X O |
2 |X O O |
  --------
   0 1 2 
  --------
0 |X O X |
1 |O X X |
2 |X O O |
  --------
   0 1 2 
  --------
0 |O X O |
1 |O X X |
2 |X O X |
  --------
   0 1 2 
  --------
0 |O X - |
1 |O X X |
2 |- X X |
  --------
   0 1 2 
  --------
0 |O X O |
1 |X X X |
2 |O O X |
  --------
   0 1 2 
  --------
0 |X O X |
1 |O X - |
2 |O O O |
  --------
   0 1 2 
  --------
0 |X X O |
1 |X X O |
2 |- O X |
  --------
   0 1 2 
  --------
0 |O - O |
1 |X X X |
2 |X - - |
  --------
   0 1 2 
  --------
0 |O X - |
1 |X O X |
2 |X O O |
  --------
   0 1 2 
  --------
0 |- X O |
1 |O O X |
2 |X X X |
  --------
   0 1 2 
  --------
0 |- - O |
1 |- O - |
2 |X X X |
  --------
   0 1 2 
  --------
0 |O

   0 1 2 
  --------
0 |X X X |
1 |O - - |
2 |- O - |
  --------
   0 1 2 
  --------
0 |O X O |
1 |X X - |
2 |O X O |
  --------
   0 1 2 
  --------
0 |X X O |
1 |X O X |
2 |O X O |
  --------
   0 1 2 
  --------
0 |X O X |
1 |X X O |
2 |O O X |
  --------
   0 1 2 
  --------
0 |O X O |
1 |O - X |
2 |O O X |
  --------
   0 1 2 
  --------
0 |O O X |
1 |X O O |
2 |X - O |
  --------
   0 1 2 
  --------
0 |O - - |
1 |O - X |
2 |O - X |
  --------
   0 1 2 
  --------
0 |X O - |
1 |X - X |
2 |O O O |
  --------
   0 1 2 
  --------
0 |X X X |
1 |O X O |
2 |X O O |
  --------
   0 1 2 
  --------
0 |O X X |
1 |O O X |
2 |- - O |
  --------
   0 1 2 
  --------
0 |- O - |
1 |- O X |
2 |- O - |
  --------
   0 1 2 
  --------
0 |- X X |
1 |O O X |
2 |X - X |
  --------
   0 1 2 
  --------
0 |O O X |
1 |X X X |
2 |O O X |
  --------
   0 1 2 
  --------
0 |X X O |
1 |O O X |
2 |O O X |
  --------
   0 1 2 
  --------
0 |X X O |
1 |O O X |
2 |X O X |
  --------
   0 1 2 
  --------
0 |O

   0 1 2 
  --------
0 |O X O |
1 |X X O |
2 |- X X |
  --------
   0 1 2 
  --------
0 |X O X |
1 |X O O |
2 |X - - |
  --------
   0 1 2 
  --------
0 |X X - |
1 |X X O |
2 |X O O |
  --------
   0 1 2 
  --------
0 |X O X |
1 |X - X |
2 |O O X |
  --------
   0 1 2 
  --------
0 |X O - |
1 |X - - |
2 |X - O |
  --------
   0 1 2 
  --------
0 |O O - |
1 |X X X |
2 |- X X |
  --------
   0 1 2 
  --------
0 |X X X |
1 |O X O |
2 |X O X |
  --------
   0 1 2 
  --------
0 |X O O |
1 |O X O |
2 |O X X |
  --------
   0 1 2 
  --------
0 |O O X |
1 |X X X |
2 |O X O |
  --------
   0 1 2 
  --------
0 |O X X |
1 |O - X |
2 |X - X |
  --------
   0 1 2 
  --------
0 |X X - |
1 |O O O |
2 |- - X |
  --------
   0 1 2 
  --------
0 |O X X |
1 |O X - |
2 |- X O |
  --------
   0 1 2 
  --------
0 |X - X |
1 |O O O |
2 |X O O |
  --------
   0 1 2 
  --------
0 |- O - |
1 |X O - |
2 |- O - |
  --------
   0 1 2 
  --------
0 |O - O |
1 |X X X |
2 |- - - |
  --------
   0 1 2 
  --------
0 |O

0 |X X X |
1 |X O X |
2 |O O X |
  --------
   0 1 2 
  --------
0 |O O - |
1 |X X X |
2 |- O - |
  --------
   0 1 2 
  --------
0 |X X - |
1 |O X O |
2 |X X O |
  --------
   0 1 2 
  --------
0 |X O O |
1 |O X O |
2 |X O X |
  --------
   0 1 2 
  --------
0 |O X X |
1 |O O O |
2 |X - O |
  --------
   0 1 2 
  --------
0 |X - - |
1 |- O X |
2 |O O O |
  --------
   0 1 2 
  --------
0 |O X X |
1 |O O X |
2 |X - O |
  --------
   0 1 2 
  --------
0 |O O O |
1 |O X X |
2 |- X - |
  --------
   0 1 2 
  --------
0 |X X X |
1 |O X O |
2 |O X O |
  --------
   0 1 2 
  --------
0 |O X X |
1 |X O X |
2 |X O O |
  --------
   0 1 2 
  --------
0 |O X X |
1 |- X - |
2 |O X O |
  --------
   0 1 2 
  --------
0 |X - O |
1 |X O O |
2 |X O X |
  --------
   0 1 2 
  --------
0 |O X X |
1 |O X X |
2 |X O O |
  --------
   0 1 2 
  --------
0 |O X X |
1 |O X X |
2 |X O X |
  --------
   0 1 2 
  --------
0 |X X O |
1 |X O X |
2 |X O X |
  --------
   0 1 2 
  --------
0 |X - X |
1 |X O - |
2 |

   0 1 2 
  --------
0 |X O - |
1 |O - O |
2 |X X X |
  --------
   0 1 2 
  --------
0 |- O O |
1 |X X X |
2 |- X O |
  --------
   0 1 2 
  --------
0 |X - O |
1 |X - X |
2 |X O O |
  --------
   0 1 2 
  --------
0 |O O O |
1 |O - X |
2 |- X X |
  --------
   0 1 2 
  --------
0 |O X X |
1 |X X O |
2 |X O X |
  --------
   0 1 2 
  --------
0 |O X O |
1 |O O X |
2 |X X O |
  --------
   0 1 2 
  --------
0 |- - X |
1 |O O X |
2 |X O X |
  --------
   0 1 2 
  --------
0 |X O X |
1 |O O X |
2 |O X O |
  --------
   0 1 2 
  --------
0 |- - O |
1 |X X X |
2 |O O X |
  --------
   0 1 2 
  --------
0 |X X O |
1 |O X X |
2 |X O O |
  --------
   0 1 2 
  --------
0 |O O X |
1 |O - X |
2 |- X X |
  --------
   0 1 2 
  --------
0 |O O X |
1 |X X O |
2 |O X X |
  --------
   0 1 2 
  --------
0 |X X - |
1 |X - - |
2 |X O O |
  --------
   0 1 2 
  --------
0 |X O O |
1 |X O - |
2 |X X O |
  --------
   0 1 2 
  --------
0 |X - O |
1 |- X - |
2 |O - X |
  --------
   0 1 2 
  --------
0 |X

   0 1 2 
  --------
0 |O O X |
1 |O X O |
2 |X X O |
  --------
   0 1 2 
  --------
0 |X X X |
1 |O X O |
2 |- - - |
  --------
   0 1 2 
  --------
0 |X - O |
1 |X - O |
2 |O X O |
  --------
   0 1 2 
  --------
0 |- - X |
1 |X - - |
2 |O O O |
  --------
   0 1 2 
  --------
0 |O O - |
1 |X X X |
2 |X X O |
  --------
   0 1 2 
  --------
0 |O O O |
1 |- O O |
2 |O X X |
  --------
   0 1 2 
  --------
0 |O X O |
1 |X X X |
2 |X O O |
  --------
   0 1 2 
  --------
0 |X X X |
1 |- O X |
2 |- O - |
  --------
   0 1 2 
  --------
0 |O O O |
1 |X X - |
2 |- - - |
  --------
   0 1 2 
  --------
0 |X X O |
1 |X O O |
2 |X X X |
  --------
   0 1 2 
  --------
0 |O - X |
1 |- - X |
2 |O - X |
  --------
   0 1 2 
  --------
0 |- X X |
1 |X O X |
2 |O O X |
  --------
   0 1 2 
  --------
0 |O O - |
1 |- O X |
2 |X X X |
  --------
   0 1 2 
  --------
0 |O X - |
1 |X O O |
2 |X - O |
  --------
   0 1 2 
  --------
0 |O X - |
1 |O O - |
2 |X X X |
  --------
   0 1 2 
  --------
0 |X

   0 1 2 
  --------
0 |O O O |
1 |X O - |
2 |X O X |
  --------
   0 1 2 
  --------
0 |X O X |
1 |O O X |
2 |O - X |
  --------
   0 1 2 
  --------
0 |O O - |
1 |X X X |
2 |X X - |
  --------
   0 1 2 
  --------
0 |- X X |
1 |O O X |
2 |O O X |
  --------
   0 1 2 
  --------
0 |X X X |
1 |- - O |
2 |O - - |
  --------
   0 1 2 
  --------
0 |O O O |
1 |X - - |
2 |- X X |
  --------
   0 1 2 
  --------
0 |- O X |
1 |O O O |
2 |- X O |
  --------
   0 1 2 
  --------
0 |O - - |
1 |X X X |
2 |X O O |
  --------
   0 1 2 
  --------
0 |O - - |
1 |X X X |
2 |- O - |
  --------
   0 1 2 
  --------
0 |- O O |
1 |X O X |
2 |X X X |
  --------
   0 1 2 
  --------
0 |X O O |
1 |X O X |
2 |X X O |
  --------
   0 1 2 
  --------
0 |- X O |
1 |O O O |
2 |O O X |
  --------
   0 1 2 
  --------
0 |- - - |
1 |X - X |
2 |O O O |
  --------
   0 1 2 
  --------
0 |O - X |
1 |- - X |
2 |X O X |
  --------
   0 1 2 
  --------
0 |X X O |
1 |X O O |
2 |O - O |
  --------
   0 1 2 
  --------
0 |X

<stable_baselines3.ppo.ppo.PPO at 0x166099350>

In [7]:
# Trained agent: repeat the same 100-episode evaluation after model.learn()
# so the result can be compared with the pre-training baseline.
mean_reward_after_train = evaluate(model, num_episodes=100)

   0 1 2 
  --------
0 |O X O |
1 |- - O |
2 |- X O |
  --------
   0 1 2 
  --------
0 |X O X |
1 |O O O |
2 |- X O |
  --------
   0 1 2 
  --------
0 |X X O |
1 |O X X |
2 |X O O |
  --------
   0 1 2 
  --------
0 |O X O |
1 |- O - |
2 |X X X |
  --------
   0 1 2 
  --------
0 |O - X |
1 |- O - |
2 |- X O |
  --------
   0 1 2 
  --------
0 |X O O |
1 |X O O |
2 |O X X |
  --------
   0 1 2 
  --------
0 |X X - |
1 |O X X |
2 |O O X |
  --------
   0 1 2 
  --------
0 |- X - |
1 |O X - |
2 |X X O |
  --------
   0 1 2 
  --------
0 |- O X |
1 |- O O |
2 |O O - |
  --------
   0 1 2 
  --------
0 |- O O |
1 |O O O |
2 |X X - |
  --------
   0 1 2 
  --------
0 |X O X |
1 |- O O |
2 |X O X |
  --------
   0 1 2 
  --------
0 |O O X |
1 |O X O |
2 |X O X |
  --------
   0 1 2 
  --------
0 |O X X |
1 |X O O |
2 |- X O |
  --------
   0 1 2 
  --------
0 |- X O |
1 |- - O |
2 |X X O |
  --------
   0 1 2 
  --------
0 |O X - |
1 |- X - |
2 |- X O |
  --------
   0 1 2 
  --------
0 |X

In [8]:
# Persist the trained agent to disk; the path has no placeholders, so a plain
# string literal is used instead of an f-string.
model.save('models/Agent_PPO')

In [9]:
# Roll out 100 episodes with the current policy, stepping the `env` object
# created by gym.make directly rather than the env returned by model.get_env().
obs = env.reset()
for _ in range(100):
    while True:
        action, _state = model.predict(obs)
        obs, reward, done, info = env.step(action)
        # env.render() could be called here to display each move.
        if done:
            break

    # Start the next episode from a fresh state.
    obs = env.reset()

   0 1 2 
  --------
0 |- X O |
1 |X X X |
2 |O - O |
  --------
   0 1 2 
  --------
0 |X X X |
1 |- O X |
2 |O O X |
  --------
   0 1 2 
  --------
0 |X X O |
1 |O X X |
2 |X O O |
  --------
   0 1 2 
  --------
0 |X X X |
1 |O O X |
2 |- - O |
  --------
   0 1 2 
  --------
0 |O X O |
1 |O - - |
2 |O X X |
  --------
   0 1 2 
  --------
0 |- - X |
1 |- O X |
2 |O - X |
  --------
   0 1 2 
  --------
0 |O X O |
1 |- O X |
2 |X O O |
  --------
   0 1 2 
  --------
0 |O X X |
1 |- X - |
2 |O O O |
  --------
   0 1 2 
  --------
0 |O X O |
1 |X O X |
2 |O - O |
  --------
   0 1 2 
  --------
0 |O X O |
1 |O X X |
2 |O - X |
  --------
   0 1 2 
  --------
0 |X X X |
1 |- X O |
2 |- O - |
  --------
   0 1 2 
  --------
0 |O X O |
1 |O O X |
2 |O - X |
  --------
   0 1 2 
  --------
0 |O O O |
1 |O O X |
2 |- X X |
  --------
   0 1 2 
  --------
0 |X - X |
1 |- X X |
2 |O O X |
  --------
   0 1 2 
  --------
0 |O O O |
1 |X - O |
2 |X - X |
  --------
   0 1 2 
  --------
0 |O