____
# Pac-Man AI ~ Research Notebook
____

## Important

1. All manually set hyperparameters must be configured by hand in both the Optuna optimization step and the final training step.
___

## 1. Dependencies

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import time
import curses
import joblib

In [None]:
from stable_baselines3 import PPO
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

In [None]:
import optuna
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_param_importances

In [None]:
from pacman_gymenv import PacmanEnvironment

## General configuration

In [None]:
# Number of environment copies handed to SubprocVecEnv (one list entry per copy).
N_CPU = 8

## Hyperparameter Optimization with Optuna

In [None]:
# --- tuning configuration ---------------------------------------------------
# Map file the tuning environment is built from.
MAP = "pacman/maps/lv1.txt"
PACMAN_ENV = PacmanEnvironment(pacmanmap=MAP)

# Search ranges, each written as [lower bound, upper bound].
TUNING_HORIZON = [32, 5000]
TUNING_MINIBATCH_RANGE = [4, 4096]
TUNING_EPOCHS = [3, 30]
TUNING_CLIP_RANGE = [0.1, 0.3]
TUNING_GAMMA = [0.8, 0.9997]
TUNING_GAE = [0.9, 1.0]
TUNING_VF = [0.5, 1.0]
TUNING_EF = [0.0, 0.01]
TUNING_LEARNINGRATE = [5e-6, 0.003]

# Number of Optuna trials, and the training budget (timesteps) of each trial.
TUNING_STEPS = 32
TUNING_TIMESTEPS = 8 * 4096

# Filename-safe tag for the map: slashes become underscores and everything
# from the first dot onwards is dropped.
lvname = MAP.replace("/", "_").partition(".")[0]

In [None]:
# optuna objective function
def objective_ppo(trial):
    """Train a PPO agent with trial-suggested hyperparameters and score it.

    Args:
        trial: Optuna trial used to sample ``n_epochs``, ``clip_range``,
            ``gamma``, ``ent_coef`` and ``learning_rate`` from the
            corresponding ``TUNING_*`` ranges.

    Returns:
        The mean reward returned by ``evaluate_policy`` for the trained model.
    """
    # Hyperparameters that are not tuned are fixed in the PPO call below. To
    # tune one of them, re-enable its line here and pass the variable through.
    # n_steps     = trial.suggest_int('n_steps', TUNING_HORIZON[0], TUNING_HORIZON[1])
    # batch_size  = trial.suggest_int('batch_size', TUNING_MINIBATCH_RANGE[0], TUNING_MINIBATCH_RANGE[1])
    # gae_lambda  = trial.suggest_float('gae_lambda', TUNING_GAE[0], TUNING_GAE[1])
    # vf_coef     = trial.suggest_float('vf_coef', TUNING_VF[0], TUNING_VF[1])
    n_epochs = trial.suggest_int('n_epochs', TUNING_EPOCHS[0], TUNING_EPOCHS[1])
    clip_range = trial.suggest_float('clip_range', TUNING_CLIP_RANGE[0], TUNING_CLIP_RANGE[1])
    gamma = trial.suggest_float('gamma', TUNING_GAMMA[0], TUNING_GAMMA[1])
    ent_coef = trial.suggest_float('ent_coef', TUNING_EF[0], TUNING_EF[1])
    learning_rate = trial.suggest_float('learning_rate', TUNING_LEARNINGRATE[0], TUNING_LEARNINGRATE[1])

    env = SubprocVecEnv([lambda: Monitor(PACMAN_ENV) for _ in range(N_CPU)])
    try:
        model = PPO(
            policy="MlpPolicy",
            env=env,
            learning_rate=learning_rate,
            n_steps=2048,
            batch_size=64,
            n_epochs=n_epochs,
            gamma=gamma,
            gae_lambda=0.95,
            clip_range=clip_range,
            ent_coef=ent_coef,
            vf_coef=0.5,
            verbose=0,
            seed=0,
            policy_kwargs=dict(net_arch=[64, 64]),
        )
        model.learn(total_timesteps=TUNING_TIMESTEPS)

        reward_mean, _ = evaluate_policy(model, env)
    finally:
        # Every trial starts N_CPU worker processes; close them even when
        # training fails so they do not pile up over the whole study.
        env.close()
    return reward_mean

In [None]:
# optimize: run the hyperparameter search, maximizing the objective's return value
study = optuna.create_study(direction="maximize")
study.optimize(objective_ppo, n_trials=TUNING_STEPS, gc_after_trial=True)

# Study name = level tag, trial count, per-trial timesteps and the names of
# the best parameters (with their underscores removed), all joined by "_".
param_tag = "_".join(name.replace("_", "") for name in study.best_params)
studyname = f"{lvname}_{TUNING_STEPS}_{TUNING_TIMESTEPS}_{param_tag}"

In [None]:
# save the study
import os

# joblib.dump does not create missing directories, so make sure the target
# folder exists before the finished study is written out.
os.makedirs("optuna_studies", exist_ok=True)
joblib.dump(study, f"optuna_studies/{studyname}.pkl")

In [None]:
# Display the study's best parameter set together with its best objective value.
study.best_params, study.best_value

In [None]:
# Render the optimization history, then export it as PNG and EPS.
fig = plot_optimization_history(study)
fig.show()
for ext in ("png", "eps"):
    fig.write_image(f"optuna_studies/{studyname}_optimization_hist.{ext}")

In [None]:
# Render the parameter importances, then export them as PNG and EPS.
fig = plot_param_importances(study)
fig.show()
for ext in ("png", "eps"):
    fig.write_image(f"optuna_studies/{studyname}_param_importances.{ext}")


## Final Training

In [None]:
# --- final-training configuration -------------------------------------------
MAP = "pacman/maps/lv1.txt"
PACMAN_ENV = PacmanEnvironment(pacmanmap=MAP)

# Saved Optuna study whose best parameters are passed to the final model.
# NOTE(review): this filename refers to lv2 / 8192 timesteps while MAP above
# is lv1 — confirm that reusing this study is intended.
OPTUNA_STUDY_PATH = "optuna_studies/pacman_maps_lv2_32_8192_nepochs_cliprange_gamma_entcoef_learningrate.pkl"

# Timestep budget handed to model.learn for the final run.
MODEL_TIMESTEPS = 8 * 32 * 4096

# NOTE: joblib.load unpickles the file; only load studies you created yourself.
optuna_study = joblib.load(OPTUNA_STUDY_PATH)

# Filename-safe tag for the map (slashes -> underscores, extension dropped).
lvname = MAP.replace("/", "_").partition(".")[0]

In [None]:
# train
# The non-tuned hyperparameters are fixed here; the rest come from the best
# trial of the loaded Optuna study.
env = SubprocVecEnv([lambda: Monitor(PACMAN_ENV) for _ in range(N_CPU)])
try:
    model = PPO(
        policy="MlpPolicy",
        env=env,
        batch_size=64,
        n_steps=2048,
        gae_lambda=0.95,
        vf_coef=0.5,
        verbose=0,
        seed=0,
        policy_kwargs=dict(net_arch=[64, 64]),
        **optuna_study.best_params,
    )
    model.learn(MODEL_TIMESTEPS, progress_bar=True)
finally:
    # Shut down the N_CPU worker processes once training has finished or
    # failed; otherwise every re-run of this cell leaves another set behind.
    env.close()

In [None]:
# Model file name: "<level tag>_<timesteps>_<study path with slashes replaced
# by underscores and everything from the first dot onwards removed>".
study_tag = OPTUNA_STUDY_PATH.replace("/", "_").partition(".")[0]
model.save(f"models/{lvname}_{MODEL_TIMESTEPS}_{study_tag}")

## Analysis

In [None]:
# --- analysis configuration -------------------------------------------------
MAP = "pacman/maps/lv1.txt"
# NOTE(review): this filename encodes 131072 timesteps and an lv1 study, which
# differs from the final-training settings in this notebook — confirm that
# this is the model you want to analyse.
MODEL_PATH = "models/pacman_maps_lv1_131072_optuna_studies_pacman_maps_lv1_32_8192_nepochs_cliprange_gamma_entcoef_learningrate.zip"
PACMAN_ENV = PacmanEnvironment(pacmanmap=MAP)

# Filename-safe tags (slashes -> underscores, cut at the first dot) used for
# the output folder name and the plot legends.
modelname = MODEL_PATH.replace("/", "_").partition(".")[0]
lvname = MAP.replace("/", "_").partition(".")[0]

model = PPO.load(MODEL_PATH)

In [None]:
# function to make plotting easier
def plot(path:str, title:str, xname:str, yname:str, ax) -> None:
    """Decorate the current figure, save it as EPS and PNG, then display it.

    Args:
        path: Output path without extension; ".eps" and ".png" are appended.
        title: Title of the plot.
        xname: Label of the x axis.
        yname: Label of the y axis.
        ax: Axes whose y-axis grid is switched on.
    """
    # Silver grid lines on the y axis, kept below the plotted data.
    ax.yaxis.grid(color='silver')
    ax.set_axisbelow(True)

    plt.subplots_adjust(bottom=0.15)
    plt.title(title)
    plt.xlabel(xname)
    plt.ylabel(yname)
    plt.legend()

    # Same export order as before: EPS first, then PNG.
    for fmt in ("eps", "png"):
        plt.savefig(f"{path}.{fmt}", format=fmt)
    plt.show()

In [None]:
# Play 512 episodes with the loaded model and record one row per episode:
# [win flag (1 when no pellets are left, else 0), final score, steps taken].
data = []
pacmanenv = PACMAN_ENV
for i in range(512):
    obs = pacmanenv.reset()
    framecnt = 0
    finished = False
    while not finished:
        action, _states = model.predict(obs)
        obs, rewards, finished, info = pacmanenv.step(action)
        framecnt += 1
    won = int(pacmanenv.game.pelletcount == 0)
    data.append([won, pacmanenv.game.score, framecnt])

In [None]:
# only run this if you want to use data from a file instead of a model

#data = np.genfromtxt(
#   "saved_data/runlv3_2.txt", delimiter=", ", dtype=np.int16)
#lvname = "pacman_maps_lv3.txt"
#modelname = "lv3_2"

In [None]:
import os

# One output folder per analysed model. makedirs also creates the parent
# "./analysis" directory when it is missing and does not fail when the folder
# already exists, so the cell can be re-run safely.
folder = f"./analysis/{modelname}"
os.makedirs(folder, exist_ok=True)
folder

In [None]:
# Turn the collected rows into an array and preview the first and last eight.
# When the rows come from the simulation cell, the columns are
# [win flag, score, steps taken].
data = np.array(data)
data[:8], "...", data[-8:]

In [None]:
# calculate win percentage
w = data[:, 0]                # first column: 1 for a won run
wc = int((w == 1).sum())      # number of won runs
wp = wc / w.shape[0]          # fraction of won runs
print(f"   win percentage: {wp*100:20}%")

In [None]:
# calculate score data (second column of the run table)
s = data[:, 1]
s_mn, s_std, s_med = np.mean(s), np.std(s), np.median(s)
print(
    f"  score mean:      {s_mn:20}",
    f"  score std:       {s_std:20.2f}",
    f"  score median:    {s_med:20}",
    sep="\n",
)

In [None]:
# calculate time data (counted in steps taken; third column of the run table)
t     = data[:,2]
t_mn  = np.mean(t)
t_std = np.std(t)
# Median of the step counts. This previously took the median of the scores
# `s`, so the reported "time median" was really the score median.
t_med = np.median(t)
print(f"  time mean:       {t_mn:20}")
print(f"  time std:        {t_std:20.2f}")
print(f"  time median:     {t_med:20}")

In [None]:
# Write the summary statistics to <folder>/stats.txt. The context manager
# closes the file even when one of the writes raises.
with open(f"{folder}/stats.txt", "w") as f:
    f.write(f"   win percentage: {wp*100:20}%\n")
    f.write(f"  score mean:      {s_mn:20}\n")
    f.write(f"  score std:       {s_std:20.2f}\n")
    f.write(f"  score median:    {s_med:20}\n")
    f.write(f"  time mean:       {t_mn:20}\n")
    f.write(f"  time std:        {t_std:20.2f}\n")
    f.write(f"  time median:     {t_med:20}\n")

In [None]:
# Scores in run order and in ascending order.
fig, ax = plt.subplots()
ax.plot(s, label=lvname)
ax.plot(np.sort(s), label=f"{lvname} sorted")
plot(f"{folder}/score", "score", "run", "score", ax)

In [None]:
# Steps taken per run, in run order and in ascending order.
fig, ax = plt.subplots()
ax.plot(t, label=lvname)
ax.plot(np.sort(t), label=f"{lvname} sorted")
plot(f"{folder}/time", "survival time", "run", "time (steps taken)", ax)

In [None]:
# Score divided by steps taken for each run, in run order and ascending.
score_per_step = s / t
fig, ax = plt.subplots()
ax.plot(score_per_step, label=lvname)
ax.plot(np.sort(score_per_step), label=f"{lvname} sorted")
plot(f"{folder}/score_per_step", "score per step for each run", "run", "score/step", ax)

In [None]:
# Pair every run's step count with its score, ordered by step count; runs
# with equal step counts keep their original relative order.
order = np.argsort(t, kind="stable")
t_to_s = np.column_stack((t[order], s[order]))

# NOTE(review): dividing by 8 presumably converts steps to seconds (the axis
# is labelled "time (s)") — confirm against the game's step rate.
fig, ax = plt.subplots()
ax.plot(t_to_s[:, 0] / 8, t_to_s[:, 1], label=lvname)
plot(f"{folder}/time_to_score", "time mapped to score", "time (s)", "score", ax)