To create a virtual environment and install dependencies, run `pants venv`. This creates a virtual environment with all the needed dependencies. Then activate the virtual environment by running `source dist/export/python/virtualenvs/python-default/3.8.16/bin/activate`.

The next few cells set up the Python path for dependent modules and perform the imports.

In [None]:
import sys
# Extend the module search path so packages under ../../schemas/gen/py can be imported
# (presumably the generated Python schema code -- confirm against the repo layout).
sys.path.append("../../schemas/gen/py")

# IPython autoreload: mode 2 reloads all modules before each cell executes, so edits to
# imported project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from gym.wrappers import TimeLimit
import gym
from stable_baselines3 import PPO
from stable_baselines3.ppo.policies import MultiInputPolicy
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common import results_plotter
from tqdm.rich import trange
from datetime import datetime
import os

from pong.env.rewards import RewardConfig
from pong.env.pong_env import PongEnv, EnvConfig, PaddleConfig
from pong.env.opponent import StaticOpponent, ModelOpponent
from pong.env.spaces import ActionType
from pong.renderer.pygame_renderer import PygameRenderer
from pong.renderer.event_renderer import EventRenderer, FileEventWriter
from pong.services.evaluation_runner import EvaluationRunner, WrappedEnv
from pong.simulation.config import SimulationConfig
from pong.env.additional_logs_callback import AdditionalLogsCallback


# Shared variables and utils
Execute this block to allow partial execution of the notebook from any cell below.

### Baseline player config

The baseline player has genome attributes that put it at an advantage over most others, as we want a well-performing agent.

In [None]:
# Shared variables and configuration used by the cells below. They are co-located so that
# executing this cell alone is enough to resume the notebook from any later cell.

width = 800
height = 400

# Change this to use a clean output dir
experiment_id = 7
experiment = f"baseline/{experiment_id}"

log_dir = f".logs/{experiment}"
tb_log_dir = f".logs/{experiment}_tb"

# Create every output directory up front. The log directory is created at `log_dir`
# (under `.logs/`) rather than at the bare `experiment` path, which would leave a stray
# `baseline/<id>` directory in the working directory and `log_dir` itself missing.
for _output_dir in (log_dir, tb_log_dir, f".models/{experiment}"):
    os.makedirs(_output_dir, exist_ok=True)


def pygame_renderer() -> PygameRenderer:
    """Build a fresh PygameRenderer using the shared `width` and `height`."""
    # The third argument (60) is presumably the target frame rate -- TODO confirm.
    renderer = PygameRenderer(width, height, 60)
    return renderer

def model_dir_for(name: str) -> str:
    """Return the path under `.models/<experiment>/` where the model called `name` is stored."""
    models_root = f".models/{experiment}"
    return f"{models_root}/{name}"

def load_model(name: str) -> PPO:
    """Load the saved PPO model called `name` and point its TensorBoard log at `tb_log_dir`."""
    checkpoint_path = model_dir_for(name)
    restored = PPO.load(checkpoint_path)
    # Redirect TensorBoard output to the current experiment's log directory.
    restored.tensorboard_log = tb_log_dir
    return restored

def evaluate(model: PPO, env: gym.Env, config: SimulationConfig, iterations: int = 100):
    """Evaluate `model` on `env` with an on-screen pygame renderer.

    Args:
        model: the PPO model to evaluate.
        env: the environment the model plays in.
        config: forwarded unchanged to `EvaluationRunner.evaluate`.
        iterations: forwarded as the last argument of `EvaluationRunner.evaluate`
            (defaults to 100).
    """
    # Pass a renderer instance (not the factory function), matching the other helpers,
    # and honor the caller's `iterations` instead of a hard-coded 100.
    EvaluationRunner(pygame_renderer()).evaluate(model, env, config, iterations)
    
def load_and_evaluate(name: str, env: PongEnv, config: SimulationConfig, iterations: int = 100):
    """Load the saved model called `name` and evaluate it on `env`.

    Args:
        name: name of the saved model (resolved through `load_model`).
        env: environment to evaluate in; training mode is disabled on it.
        config: forwarded unchanged to `EvaluationRunner.evaluate`.
        iterations: forwarded as the last argument of `EvaluationRunner.evaluate`
            (defaults to 100).
    """
    model = load_model(name)
    model.set_env(env)
    env.disable_training()

    # Honor the caller's `iterations` instead of a hard-coded 100.
    EvaluationRunner(pygame_renderer()).evaluate(model, env, config, iterations)
    
def load_and_evaluate_models(name_l: str, name_r: str, env: PongEnv, config: SimulationConfig, iterations: int = 100):
    """Load two saved models and evaluate the first while the second acts as the opponent.

    Args:
        name_l: name of the saved model that is evaluated.
        name_r: name of the saved model installed on `env` as a `ModelOpponent`.
        env: environment to evaluate in; training mode is disabled on it.
        config: forwarded unchanged to `EvaluationRunner.evaluate`.
        iterations: forwarded as the last argument of `EvaluationRunner.evaluate`
            (defaults to 100).
    """
    model_l = load_model(name_l)
    model_r = load_model(name_r)

    env.set_opponent(ModelOpponent(model_r))
    env.disable_training()
    model_l.set_env(env)

    # Honor the caller's `iterations` instead of a hard-coded 100.
    EvaluationRunner(pygame_renderer()).evaluate(model_l, env, config, iterations)

# helper for running the training loop but keep checkpointing along the way
def learn(model: PPO, name: str, env: PongEnv, iterations: int, checkpoint_every: int = 10_000):
    """Train `model` on `env` in chunks, saving a checkpoint after every chunk.

    Args:
        model: the PPO model to train; its env is replaced by a time-limited wrapper of `env`.
        name: used both as the TensorBoard run name and as the checkpoint name.
        env: the environment to train in.
        iterations: total number of timesteps requested across all chunks.
        checkpoint_every: timesteps requested per chunk between checkpoints (default 10_000).

    Raises:
        ValueError: if `checkpoint_every` is not positive.
    """
    if checkpoint_every <= 0:
        raise ValueError("checkpoint_every must be a positive number of timesteps")

    # Cap every episode at 60 * 60 * 1 = 3600 steps.
    wrapped = TimeLimit(env, 60 * 60 * 1)
    model.set_env(wrapped)

    for start in range(0, iterations, checkpoint_every):
        # The last chunk only requests what is left, so the total never exceeds `iterations`
        # when it is not a multiple of `checkpoint_every`.
        chunk = min(checkpoint_every, iterations - start)
        # reset_num_timesteps=False keeps the timestep counter running across chunks.
        model.learn(total_timesteps=chunk, tb_log_name=name, callback=AdditionalLogsCallback(), reset_num_timesteps=False)
        model.save(model_dir_for(name))
    
    
    
# =========== Baseline player config ==============
# The baseline player's genome attributes put it at an advantage over most others, since we
# want a well-performing agent.
player_paddle = PaddleConfig(
    width=100,
    strength=25,
    endurance=10,
    max_speed=1000,
)

# A static paddle that does not move
wall_paddle = PaddleConfig(
    width=1000,
    strength=1000,
    endurance=3,
    max_speed=500,
)

# Play against the wall

Start with a completely random model and train it against a wall, as we don't have any other baseline. This way the agent learns to follow the ball and defend in the early stages.
Reward near misses, as this is a faster way for the agent to learn to track the ball.

In [None]:
# Reward settings for the wall stage.
reward_config = RewardConfig(
    win_round=10,
    lose_round=-5,
    paddle_hit=5,
    near_miss_multiplier=1,
    near_miss_exponent=5,
    near_miss_min_distance=height / 2,
    survival_reward_multiplier=1,
    endurance_penalty_multiplier=-1,
)

# Left side: the baseline player. Right side: the wall paddle.
play_against_wall_config = EnvConfig(
    paddle_l=player_paddle,
    paddle_r=wall_paddle,
    width=width,
    height=height,
)

play_against_wall_env = PongEnv(play_against_wall_config, reward_config)
play_against_wall_env.enable_training()
# The opponent always issues the STOP action.
play_against_wall_env.set_opponent(StaticOpponent(ActionType.STOP, 0.0))

# Fresh (untrained) PPO model running on the CPU, logging to the experiment's TensorBoard dir.
model = PPO(
    MultiInputPolicy,
    play_against_wall_env,
    verbose=0,
    device="cpu",
    tensorboard_log=tb_log_dir,
)

In [None]:
# Train against the wall, checkpointing under the name "wall_near_miss".
learn(model, "wall_near_miss", play_against_wall_env, iterations=10_000_000)

In [None]:
# Persist the current weights of the wall-trained model.
wall_model_path = model_dir_for("wall_near_miss")
model.save(wall_model_path)

In [None]:
# Reload the saved wall-stage model and watch it play against the wall.
load_and_evaluate(
    name="wall_near_miss",
    env=play_against_wall_env,
    config=play_against_wall_config,
)

# Play against itself to improve on defense and learn attacking moves

We use the same model to play against itself. The wall helps the agent learn defense but does not allow it to learn attacking strategies. Let's set up two reward configs so we can get two model behaviors: one model that is rewarded for defense and another that is rewarded for attack.

In [None]:
# Continue from the wall-trained checkpoint.
model = load_model("wall_near_miss")

# still no reward for winning, we need to defend a moving target first
reward_config = RewardConfig(
    win_round=-10,
    lose_round=-10,
    paddle_hit=5,
    near_miss_multiplier=1,
    near_miss_exponent=5,
    near_miss_min_distance=height / 10,
    survival_reward_multiplier=1,
    endurance_penalty_multiplier=-1,
)

# Both sides use the same baseline paddle.
env_config = EnvConfig(
    paddle_l=player_paddle,
    paddle_r=player_paddle,
    width=width,
    height=height,
)

self_env = PongEnv(env_config, reward_config)
self_env.enable_training()
model.set_env(self_env)

In [None]:
# The opponent is the very same model object that is being trained.
opponent = model
self_play_opponent = ModelOpponent(opponent)
self_env.set_opponent(self_play_opponent)

learn(model, "play_against_self_opponent_3", self_env, iterations=1_000_000)

In [None]:
# Persist the current weights of the self-play model.
self_play_model_path = model_dir_for("play_against_self_opponent_3")
model.save(self_play_model_path)

In [None]:
# Pit two freshly loaded copies of the self-play model against each other.
load_and_evaluate_models(
    name_l="play_against_self_opponent_3",
    name_r="play_against_self_opponent_3",
    env=self_env,
    config=env_config,
)

In [None]:
# Continue from the self-play checkpoint.
model = load_model("play_against_self_opponent_3")

# slowly start rewarding for winning rounds
reward_config = RewardConfig(
    win_round = 10,
    lose_round = -10,
    paddle_hit = 10,
    near_miss_multiplier = 0,
    near_miss_exponent = 4,
    near_miss_min_distance = height / 10,
    survival_reward_multiplier = 0,
    endurance_penalty_multiplier = -0,
)

env_config = EnvConfig(
    paddle_l = player_paddle,
    paddle_r = player_paddle,
    width = width,
    height = height,
)

self_env = PongEnv(env_config, reward_config)
# Enable training explicitly, as the other training cells do for their environments.
# NOTE(review): PongEnv's default mode is not visible here -- confirm this call is wanted.
self_env.enable_training()
model.set_env(self_env)

# Seed the "attack_opponent" checkpoint so the first round has an opponent to load.
model.save(model_dir_for("attack_opponent"))
for _ in range(500):
    # Each round plays against a frozen copy of the most recently saved checkpoint.
    opponent = load_model("attack_opponent")
    self_env.set_opponent(ModelOpponent(opponent))

    learn(model, "attack_opponent", self_env, 10_000)
    # Snapshot the learner so the next round's opponent loads the latest weights.
    model.save(model_dir_for("attack_opponent"))


In [None]:
# Persist the current weights of the attack-stage model.
attack_model_path = model_dir_for("attack_opponent")
model.save(attack_model_path)

In [None]:
# Evaluate the attack-stage model (left) against the earlier self-play model (right).
load_and_evaluate_models(
    name_l="attack_opponent",
    name_r="play_against_self_opponent_3",
    env=self_env,
    config=env_config,
)