In [None]:
from stable_baselines3 import PPO
from stable_baselines3 import DQN
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.evaluation import evaluate_policy
from battle_environment import BattleEnvironment
import os
import pprint
import matplotlib.pyplot as plt

In [None]:
# Pick the first unused run directory (models/models1 ... models/models99) and
# create it together with its log sub-directory.
for i in range(1, 100):
    candidate = f'models/models{i}'
    if not os.path.exists(candidate):
        FOLDER = candidate
        LOG_DIR = f'{FOLDER}/logs'
        # makedirs creates FOLDER as an intermediate directory of LOG_DIR.
        os.makedirs(LOG_DIR)
        break
else:
    # Every slot is taken: fail here instead of leaving FOLDER / LOG_DIR
    # undefined (or stale from an earlier run of this cell).
    raise RuntimeError("No free run directory: models/models1 .. models/models99 all exist")

In [None]:
# Winrate Graph: a single figure/axes pair that the training cells draw on.
fig, ax = plt.subplots()
ax.set_xlabel('# of games played')
ax.set_ylabel('Percentage of agent wins (%)')
ax.grid()
# Notebook-level point buffers; TrainAndLoggingCallback._on_step appends to them.
x, y = [], []

class TrainAndLoggingCallback(BaseCallback):
    """Checkpoint the model periodically and record the recent win rate.

    Every ``check_freq`` calls the model is saved under ``save_path``. Every
    ``log_freq`` calls the win percentage over the games finished since the
    previous report is printed and appended, together with the running game
    count, to this callback's own ``x`` / ``y`` lists and to the notebook-level
    ``x`` / ``y`` lists.

    Args:
        check_freq: Number of callback calls between two checkpoints.
        save_path: Directory for checkpoints (created if missing). If None,
            no checkpoints are written.
        env: Object whose ``team['red']['wins']`` and ``total_games`` counters
            are read (presumably the environment the model trains on, with
            'red' being the agent's team -- confirm against BattleEnvironment).
        verbose: Verbosity level forwarded to ``BaseCallback``.
        log_freq: Number of callback calls between two win-rate reports.
    """

    def __init__(self, check_freq, save_path, env, verbose=1, log_freq=10000):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path
        self.log_freq = log_freq
        self.env = env
        # Counter values as of the previous report.
        self.total_wins = 0
        self.total_games = 0
        # Wins / games inside the most recently reported window.
        self.num_wins = 0
        self.games = 0
        # Per-instance history: games played (x) vs. windowed win percentage (y).
        self.x = []
        self.y = []

    def _init_callback(self):
        """Create the checkpoint directory if a save path was given."""
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        """Save a checkpoint and/or report the win rate when due; never stops training."""
        # Skip checkpointing when no save path was given instead of crashing
        # in os.path.join(None, ...).
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, f'{self.n_calls}')
            self.model.save(model_path)
        if self.n_calls % self.log_freq == 0 and self.n_calls > 0:
            self._report_win_rate()
        return True

    def _report_win_rate(self):
        """Print and record the win percentage since the previous report."""
        wins_so_far = self.env.team['red']['wins']
        games_so_far = self.env.total_games
        # As before, nothing is reported until the first win has been recorded.
        if wins_so_far <= 0:
            return
        games_in_window = games_so_far - self.total_games
        # No game finished since the last report: skip rather than divide by zero.
        if games_in_window <= 0:
            return

        self.num_wins = wins_so_far - self.total_wins
        self.games = games_in_window
        self.total_wins = wins_so_far
        self.total_games = games_so_far

        win_pct = self.num_wins / self.games * 100
        print(f"\n\n\n\n--------------------\ntotal games:{self.total_games}\ntimesteps:{self.n_calls}\nwin percentage:{round(win_pct, 2)}\n--------------------\n\n\n\n")
        self.x.append(self.total_games)
        self.y.append(win_pct)
        # Also feed the notebook-level buffers defined next to the figure,
        # which the plotting code reads.
        x.append(self.total_games)
        y.append(win_pct)

# Interval handed to the callback as check_freq and used to round the reported timesteps.
save_freq = 10_000

In [None]:
# Reward-shaping values passed as keyword arguments to BattleEnvironment
# and written to the results file.
cf = dict(
    hit_base_reward=10,
    hit_plane_reward=1,
    miss_punishment=0,
    too_long_punishment=0,
    lose_punishment=-3,
)

In [None]:
# Training budget passed to model.learn() in the training cells.
timesteps = 10000000
# Largest multiple of save_freq that does not exceed timesteps.
saved_timesteps = timesteps // save_freq * save_freq
# Append-mode results log for this run. The handle is deliberately left open:
# the evaluation cells write to it later.
file = open(f"{FOLDER}/results.txt", 'a')
# flush=True so the header reaches disk before the long training cells run,
# instead of sitting in the buffer of a handle that is never closed.
print(f"Timesteps:{saved_timesteps}\nConfig:{pprint.pformat(cf)}", file=file, flush=True)

## PPO training

In [None]:
# Periodic PPO checkpoints are written below this directory.
CHECKPOINT_DIR = f'{FOLDER}/train_PPO'

# Create the environment and model
env = BattleEnvironment(
    show=False,
    hit_base_reward=cf['hit_base_reward'],
    hit_plane_reward=cf['hit_plane_reward'],
    miss_punishment=cf['miss_punishment'],
    too_long_punishment=cf['too_long_punishment'],
    lose_punishment=cf['lose_punishment'],
)
callback = TrainAndLoggingCallback(
    check_freq=save_freq,
    save_path=CHECKPOINT_DIR,
    env=env,
)
model = PPO(
    'MlpPolicy',
    env,
    tensorboard_log=LOG_DIR,
    verbose=1,
)

# Train the model and save graph
model.learn(total_timesteps=timesteps, callback=callback)
model.save(f"{FOLDER}/final_model_PPO")
del model

# Draw the PPO curve, then empty the shared buffers in place so the next
# training cell starts from an empty series.
ax.plot(x, y)
del x[:]
del y[:]

## DQN training

In [None]:
# DQN checkpoints get their own directory so they do not overwrite the PPO
# checkpoints, which are saved under the same step-count file names.
CHECKPOINT_DIR = f'{FOLDER}/train_DQN'

# Create the environment and model
# cf's keys are exactly the reward keyword names BattleEnvironment is given.
env = BattleEnvironment(show=False, **cf)
callback = TrainAndLoggingCallback(check_freq=save_freq, save_path=CHECKPOINT_DIR, env=env)
model = DQN('MlpPolicy', env, tensorboard_log=LOG_DIR, verbose=1)

# Train the model and save graph
model.learn(total_timesteps=timesteps, callback=callback)
model.save(f"{FOLDER}/final_model_DQN")
del model

# The callback appends its points to the notebook-level x / y lists, so plot
# those (same source the PPO cell plots from).
ax.plot(x, y)
fig.savefig(f"{FOLDER}/percent_win.png")

## PPO evaluation

In [None]:
# Load trained agent and evaluate 1000 games
# cf's keys are exactly the reward keyword names BattleEnvironment is given.
eval_env = BattleEnvironment(show=False, **cf)
model = PPO.load(f"{FOLDER}/final_model_PPO")
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=1000, deterministic=True)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")
print(eval_env.wins())
# flush=True so the result reaches disk even though the handle is never closed.
print(f"\n---EVALUATION PPO---\n{eval_env.wins()}\n", file=file, flush=True)

# Evaluate with visuals (10 games)
eval_env = BattleEnvironment(show=True, fps=30, **cf)
# The training cell saves the PPO agent as final_model_PPO; a plain
# "final_model" file is never written.
model = PPO.load(f"{FOLDER}/final_model_PPO")
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")
print(eval_env.wins())

## DQN evaluation

In [None]:
# Load trained agent and evaluate 1000 games
# cf's keys are exactly the reward keyword names BattleEnvironment is given.
eval_env = BattleEnvironment(show=False, **cf)
model = DQN.load(f"{FOLDER}/final_model_DQN")
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=1000, deterministic=True)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")
print(eval_env.wins())
# flush=True so the result reaches disk even though the handle is never closed.
print(f"\n---EVALUATION DQN---\n{eval_env.wins()}\n", file=file, flush=True)

# Evaluate with visuals (10 games)
eval_env = BattleEnvironment(show=True, fps=30, **cf)
# The training cell saves the DQN agent as final_model_DQN; a plain
# "final_model" file is never written.
model = DQN.load(f"{FOLDER}/final_model_DQN")
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")
print(eval_env.wins())