In [None]:
# Name of the game-specific folder; it is joined onto '../../../games' when
# sys.path is extended in the setup cell.
gameFolder = "DOA++-MAME"

In [None]:
import sys, os
import time
# Time-dependent seed: the sub-second part of the wall clock in milliseconds.
# Because of the -0.5 offset inside int(), the value lands roughly in 500..1499.
timeDepSeed = int((time.time() - int(time.time() - 0.5)) * 1000)

# Make the parent folder and the selected game's folder importable.
notebookDir = os.path.abspath('')
for relPath in ('../', os.path.join('../../../games', gameFolder)):
    sys.path.append(os.path.join(notebookDir, relPath))

# Output locations for TensorBoard logs and saved models.
tensorBoardFolder = "./ppo2_TB_CustXceptLstmNoShared/"
modelFolder = "ppo2_Model_CustXceptLstmNoShared/"

# The model folder must exist before anything is saved into it.
os.makedirs(modelFolder, exist_ok=True)

In [None]:
from makeDiambraEnv import *

import tensorflow as tf

from customPolicies.customXceptLstmPolicyNoShared import *

# Imported explicitly, after the wildcard imports, so that `np` always refers
# to numpy and does not depend on being re-exported by another module.
import numpy as np
from stable_baselines import PPO2
from stable_baselines.common.evaluation import evaluate_policy
from stable_baselines.common.callbacks import BaseCallback

In [None]:
class AutoSave(BaseCallback):
    """
    Callback that periodically saves a snapshot of the model being trained.

    The check is done on every callback step; a snapshot is written whenever
    the number of calls so far is a multiple of ``check_freq``.

    :param check_freq: (int) Save the model every ``check_freq`` callback calls.
    :param save_path: (str) Path prefix for the saved files. ``'autoSave_'``,
        the current call count and the suffix ``'steps_action+_add'`` are
        appended to it, so it may end with a folder separator or with the
        beginning of a file name.
    :param verbose: (int) When greater than 0, print the target path on each save.
    """
    def __init__(self, check_freq: int, save_path: str, verbose=1):
        super(AutoSave, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path_base = save_path + 'autoSave_'

    def _on_step(self) -> bool:
        """
        Save the model when the call counter hits a multiple of ``check_freq``.

        :return: (bool) Always True, so this callback never asks for training
            to stop.
        """
        if self.n_calls % self.check_freq == 0:
            # Build the full target once so the log line shows exactly the
            # path handed to ``model.save`` (not just the common prefix).
            target_path = self.save_path_base + str(self.n_calls) + "steps_action+_add"
            if self.verbose > 0:
                print("Saving model checkpoint to {}".format(target_path))
            # Save the agent
            self.model.save(target_path)

        return True

In [None]:
# Settings passed to make_diambra_env as `diambra_kwargs`.
diambraKwargs = {
    "roms_path": "../../../roms/MAMEToolkit/roms/",
    "binary_path": "../../../../customMAME/",
    "player": "P1",
    "frame_ratio": 3,
    "render": False,
    # Optional entries, currently disabled:
    # "throttle": False,
    # "sound": False,
    # "character": "Random",
    "character": "Kasumi",
}

# Settings passed to make_diambra_env as `wrapper_kwargs`.
wrapperKwargs = {
    "frame_stack": 1,
    "clip_rewards": False,
    "normalize_rewards": True,
    "scale": True,
    "scale_mod": -1,
    "hwc_obs_resize": [256, 256, 1],
}

# Additional keys passed as `key_to_add` (use None instead of a list to
# disable them, as in the original commented-out alternative).
keyToAdd = [
    "actionsBuf",
    # "player",
    "healthP1",
    "healthP2",
    "positionP1",
    "positionP2",
    # "winsP1",
    # "winsP2",
]

numEnv = 1

env = make_diambra_env(
    diambraMame,
    env_prefix="Train",
    num_env=numEnv,
    seed=timeDepSeed,
    continue_game=0,
    diambra_kwargs=diambraKwargs,
    wrapper_kwargs=wrapperKwargs,
    key_to_add=keyToAdd,
)

In [None]:
# Inspect the observation space exposed by the environment.
obsSpace = env.observation_space
print("Obs_space = ", obsSpace)
print("Obs_space type = ", obsSpace.dtype)
print("Obs_space high = ", obsSpace.high)
print("Obs_space low = ", obsSpace.low)

In [None]:
# Inspect the action space exposed by the environment.
actSpace = env.action_space
print("Act_space = ", actSpace)
print("Act_space type = ", actSpace.dtype)
print("Act_space n = ", actSpace.n)

In [None]:
# Keyword arguments forwarded to the custom policy through `policy_kwargs`.
policyKwargs = {
    # Frame observation
    "scale_in": False,
    "n_lstm": 256,
    "cnn_input": wrapperKwargs["hwc_obs_resize"],  # [256, 256, 1]
    "cnn_embeddings": 512,

    # Additional info observation
    # NOTE(review): 148 is presumably the length of the additional-info vector
    # produced by the keys in keyToAdd -- confirm against the environment.
    "n_add_info": 148,
    "layers": [128, 64],

    # Policy and value net part
    "layers_policy": [64, 64],
    "layers_value": [64, 64],
}

# Model arguments to override load
#customObjects={}
#customObjects["learning_rate"]=5e-5 # Changed from 2.5e-4 to 5e-5 @ ~6.6M iter

# Initialize the model, 1 env
model = PPO2(
    CustomXceptLstmPolicyNoShared,
    env,
    nminibatches=numEnv,
    verbose=1,
    n_steps=32,
    learning_rate=2.5e-4,
    tensorboard_log=tensorBoardFolder,
    gamma=0.94,
    policy_kwargs=policyKwargs,
)

#OR

# Load the trained agent, 1 env
#model = PPO2.load(modelFolder+"7_3Msteps_action+_add", env=env, custom_objects=customObjects,
#                  tensorboard_log=tensorBoardFolder, 
#                  policy_kwargs=policyKwargs, gamma = 0.94)

In [None]:
# Show the learning rate currently stored on the model object.
print(model.learning_rate)

In [None]:
# Checkpoint callback: writes an autosave every 100000 callback calls, using
# files whose names start with "<modelFolder>start_".
autoSaveCallback = AutoSave(check_freq=10 ** 5, save_path=modelFolder + "start_")

# Run training for ten million timesteps with the autosave callback attached.
time_steps = 10 ** 7
model.learn(total_timesteps=time_steps, callback=autoSaveCallback)

In [None]:
# Save the agent
# NOTE(review): the file name is hard-coded ("6_6Msteps...") and is not derived
# from the number of timesteps trained above -- update it by hand before
# running so an earlier checkpoint is not overwritten.
model.save(modelFolder+"6_6Msteps_action+_add")

In [None]:
# Evaluate the agent for a fixed number of episodes and report reward statistics.
observation = env.reset()
states = None  # recurrent state; None makes predict() start from a fresh state

cumulativeEpRew = 0.0      # reward accumulated in the current episode
cumulativeEpRewAll = []    # per-episode totals, used for mean/std below
cumulativeTotRew = 0.0     # sum of all per-episode totals

maxNumEp = 100
currNumEp = 0

while currNumEp < maxNumEp:

    action, states = model.predict(observation, states, deterministic=False)
    # Debug aid, disabled: computing the action probabilities costs an extra
    # forward pass per step and the result is not used by the evaluation.
    #action_prob = model.action_probability(observation, states)
    #print("Action probabilities = ", action_prob)
    #print("Max action = ", np.argmax(action_prob))
    #print("Action = ", action)

    observation, reward, done, info = env.step(action)

    cumulativeEpRew += reward

    if np.any(done):
        currNumEp += 1
        print("Ep. # = ", currNumEp)
        print("Ep. Cumulative Rew # = ", cumulativeEpRew)
        sys.stdout.flush()
        cumulativeEpRewAll.append(cumulativeEpRew)
        cumulativeTotRew += cumulativeEpRew
        cumulativeEpRew = 0.0

        # Start the next episode from a clean environment and recurrent state.
        observation = env.reset()
        states = None

print("Mean cumulative reward = ", cumulativeTotRew/maxNumEp)
print("Mean cumulative reward = ", np.mean(cumulativeEpRewAll))
print("Std cumulative reward = ", np.std(cumulativeEpRewAll))

# The environment is deliberately left open here: the next cell calls
# env.reset() on the same `env`, so closing it at this point would leave that
# cell operating on an already-closed environment when the notebook is run
# top to bottom. Call env.close() manually if you stop after this cell.

In [None]:
# Let the agent play indefinitely; interrupt the kernel to stop.
observation = env.reset()
states = None  # recurrent state; None makes predict() start from a fresh state

try:
    while True:

        action, states = model.predict(observation, states, deterministic=False)
        # Debug aid, disabled: computing the action probabilities costs an
        # extra forward pass per step and the result is not used by the loop.
        #action_prob = model.action_probability(observation, states)
        #print("Action probabilities = ", action_prob)
        #print("Max action = ", np.argmax(action_prob))
        #print("Action = ", action)
        #input("Pause")

        observation, reward, done, info = env.step(action)
        # np.any() matches the evaluation loop and also works when `done`
        # holds more than one element, where a bare `if done:` would raise.
        if np.any(done):
            observation = env.reset()
            states = None
except KeyboardInterrupt:
    # Interrupting the kernel is the only way out of the endless loop.
    print("Interrupted by user, closing the environment.")
finally:
    # Runs on interrupt and on errors alike, so the environment is always
    # released (a close() placed after the loop would never be reached).
    env.close()