In [None]:
# Identifier of the game to train on: it selects the game-specific settings
# and is embedded in the output folder names and environment prefixes below.
gameId = "doapp"

In [None]:
import sys, os
import time

# Seed taken from the sub-second part of the wall clock, expressed in
# milliseconds; the -0.5 shift keeps the value in about the 500..1499 range.
timeDepSeed = int((time.time() - int(time.time() - 0.5)) * 1000)

# Make the parent folder and the shared games folder importable.
notebookDir = os.path.abspath('')
for relPath in ('../', '../../../games'):
    sys.path.append(os.path.join(notebookDir, relPath))

# Output locations: TensorBoard logs and saved models, one pair per game.
tensorBoardFolder = "./{}_ppo2_TB_CustCnn_bL_mD/".format(gameId)
modelFolder = "./{}_ppo2_Model_CustCnn_bL_mD/".format(gameId)

# Only the model folder is created here.
os.makedirs(modelFolder, exist_ok=True)

In [None]:
from makeDiambraEnv import *

import tensorflow as tf

from customPolicies.utils import linear_schedule, AutoSave
from customPolicies.customCnnPolicy import *

# Explicit import: the evaluation cell uses `np`, which previously was only
# available if one of the star imports above happened to export it.
import numpy as np
from stable_baselines import PPO2

In [None]:
# Settings forwarded to the DIAMBRA environment (common to every game)
diambraKwargs = {
    "roms_path": "../../../roms/mame/",
    "binary_path": "../../../customMAME/",
    "frame_ratio": 6,
    "render": False,
    "player": "Random",  # P1 / P2
}

# Game dependent kwargs: only "doapp" is supported, anything else aborts here
if gameId != "doapp":
    raise Exception("Game not implemented: {}".format(gameId))

diambraKwargs.update(
    difficulty=3,
    characters=["Kasumi", "Random"],
    charOutfits=[2, 2],
)

# Settings forwarded to the DIAMBRA gym layer
diambraGymKwargs = {
    "P2brain": None,
    "continue_game": 0.0,
    "show_final": False,
    "gamePads": [None, None],
    # One entry per player; the first entry is read again later when the
    # action space is printed.
    "actionSpace": ["multiDiscrete", "multiDiscrete"],
}

# Settings forwarded to the gym wrappers (frame preprocessing and rewards)
wrapperKwargs = {
    "hwc_obs_resize": [256, 256, 1],
    "normalize_rewards": True,
    "clip_rewards": False,
    "frame_stack": 6,
    "dilation": 1,
    "scale": True,
    "scale_mod": 0,
}

# Additional observations appended to the frames (trailing comment = size of
# each entry, as noted by the original author).
# NOTE(review): the original list contained "ownPosition" twice, so the
# opponent position was never requested. "oppPosition" is assumed by analogy
# with the ownHealth/oppHealth pair -- confirm the key name against the
# environment wrapper. The list length (and therefore n_add_info) is
# unchanged, but checkpoints trained with the old list saw ownPosition in
# both slots.
keyToAdd = [
    "actionsBufP1",  # env.actBufLen*(env.n_actions[0]+env.n_actions[1])
    "ownHealth",     # 1
    "oppHealth",     # 1
    "ownPosition",   # 1
    "oppPosition",   # 1
    "stage",         # 1
    "characters",    # 2*len(env.charNames)
]

# Number of environments requested for training
numEnv = 8

# Build the training environment(s); use_subprocess=True is passed through
# to the factory together with the three kwargs dictionaries defined above.
envId = "{}_Train".format(gameId)
env = make_diambra_env(
    diambraMame,
    env_prefix=envId,
    num_env=numEnv,
    seed=timeDepSeed,
    diambra_kwargs=diambraKwargs,
    diambra_gym_kwargs=diambraGymKwargs,
    wrapper_kwargs=wrapperKwargs,
    key_to_add=keyToAdd,
    use_subprocess=True,
)

In [None]:
# Inspect the observation space exposed by the training environment
obsSpace = env.observation_space
print("Obs_space = ", obsSpace)
print("Obs_space type = ", obsSpace.dtype)
print("Obs_space high = ", obsSpace.high)
print("Obs_space low = ", obsSpace.low)

In [None]:
# Inspect the action space; the size attribute depends on the configured
# action-space type (`nvec` for multiDiscrete, `n` otherwise).
actSpace = env.action_space
print("Act_space = ", actSpace)
print("Act_space type = ", actSpace.dtype)
isMultiDiscrete = diambraGymKwargs["actionSpace"][0] == "multiDiscrete"
print("Act_space n = ", actSpace.nvec if isMultiDiscrete else actSpace.n)

In [None]:
# Policy parameters: read the action layout from the first environment
n_actions = env.get_attr("n_actions")[0]
actBufLen = env.get_attr("actBufLen")[0]

# Size of the additional (non-image) input: the action buffer term plus one
# slot per remaining key. The "-2" presumably removes the action buffer key
# (already counted by the first term) and "characters" (original note:
# "No Char Info").
actionsBufSize = actBufLen * (n_actions[0] + n_actions[1])
policyKwargs = {
    "n_add_info": actionsBufSize + len(keyToAdd) - 2,
    "layers": [64, 64],
}

print("n_add_info =", policyKwargs["n_add_info"])

In [None]:
# PPO hyper-parameters: discount factor plus linearly scheduled learning rate
# and clip range (the value-function clip range reuses the policy one).
setGamma = 0.94
setLearningRate = linear_schedule(2.3e-4, 2.5e-6)
setClipRange = linear_schedule(0.185, 0.025)
setClipRangeVf = setClipRange

# Option A (disabled): initialize a fresh model
#model = PPO2(CustCnnPolicy, env, verbose=1,
#             gamma = setGamma, nminibatches=4, noptepochs=4, n_steps=128,
#             learning_rate=setLearningRate, cliprange=setClipRange, cliprange_vf=setClipRangeVf,
#             tensorboard_log=tensorBoardFolder, policy_kwargs=policyKwargs)

# Option B (active): resume from the saved "30_7M" checkpoint, overriding the
# stored hyper-parameters with the values defined above.
model = PPO2.load(
    modelFolder + "30_7M",
    env=env,
    tensorboard_log=tensorBoardFolder,
    policy_kwargs=policyKwargs,
    gamma=setGamma,
    learning_rate=setLearningRate,
    cliprange=setClipRange,
    cliprange_vf=setClipRangeVf,
)

In [None]:
# Sanity check: show the discount factor held by the model built in the previous cell
print("Model discount factor = ", model.gamma)

In [None]:
# Total number of timesteps requested for this training run
time_steps = 36600000

# Autosave callback (per the original note: saves every user-defined number
# of steps); files are written under the model folder with the "0M_" prefix.
autoSaveCallback = AutoSave(
    check_freq=1000000,
    numEnv=numEnv,
    save_path=modelFolder + "0M_",
)

# Train the agent
model.learn(total_timesteps=time_steps, callback=autoSaveCallback)

In [None]:
# Persist the trained agent inside the model folder
finalModelPath = modelFolder + "3_4M"
model.save(finalModelPath)

## Evaluation

In [None]:
# Build a single, rendered environment for evaluation.
# NOTE(review): this rebinds `env`; the training environment created earlier
# is not closed anywhere in this notebook -- confirm whether that is intended.
diambraKwargs.update(render=True)

envId = "{}_Test".format(gameId)
env = make_diambra_env(
    diambraMame,
    env_prefix=envId,
    num_env=1,
    seed=timeDepSeed,
    diambra_kwargs=diambraKwargs,
    diambra_gym_kwargs=diambraGymKwargs,
    wrapper_kwargs=wrapperKwargs,
    key_to_add=keyToAdd,
)

In [None]:
# Evaluate the agent: play `maxNumEp` episodes with deterministic actions and
# report the per-episode cumulative reward plus its mean and standard deviation.
observation = env.reset()

cumulativeEpRew = 0.0      # reward accumulated in the episode being played
cumulativeEpRewAll = []    # one entry per finished episode
cumulativeTotRew = 0.0     # sum over all finished episodes

maxNumEp = 100
currNumEp = 0

# try/finally guarantees the environment is shut down even when the loop is
# interrupted or an exception is raised mid-episode.
try:
    while currNumEp < maxNumEp:

        # predict() returns an (action, state) pair; only the action is stepped
        action, _states = model.predict(observation, deterministic=True)

        observation, reward, done, info = env.step(action)

        cumulativeEpRew += reward

        # An episode has ended: record its return and start a new tally
        if np.any(done):
            currNumEp += 1
            print("Ep. # = ", currNumEp)
            print("Ep. Cumulative Rew # = ", cumulativeEpRew)
            cumulativeEpRewAll.append(cumulativeEpRew)
            cumulativeTotRew += cumulativeEpRew
            cumulativeEpRew = 0.0
finally:
    env.close()

# The mean is reported twice on purpose to keep the original output: once
# from the running total and once from the per-episode list.
print("Mean cumulative reward = ", cumulativeTotRew/maxNumEp)
print("Mean cumulative reward = ", np.mean(cumulativeEpRewAll))
print("Std cumulative reward = ", np.std(cumulativeEpRewAll))