In [None]:
# Game to train on. The commented-out ids below are the other values that
# the game-dependent settings later in this notebook handle.
gameId = "doapp"
#gameId = "sfiii3n"
#gameId = "tektagt"

In [None]:
import sys, os
import time
# Clock-derived seed: the current time minus its truncated (time - 0.5) part,
# scaled to milliseconds, which lands roughly in the 500..1499 range.
timeDepSeed = int(1000 * (time.time() - int(time.time() - 0.5)))

# Put the parent of the working directory on the import path so that the
# project-local modules imported further down can be found.
parentDir = os.path.join(os.path.abspath(''), '../')
sys.path.append(parentDir)

# Per-game output locations for TensorBoard logs and saved models.
tensorBoardFolder = "./%s_ppo2_TB_CustCnnSmall_bL_d_noComb/" % gameId
modelFolder = "./%s_ppo2_Model_CustCnnSmall_bL_d_noComb/" % gameId

# Only the model folder is created here; an existing folder is not an error.
os.makedirs(modelFolder, exist_ok=True)

In [None]:
from makeStableBaselinesEnv import makeStableBaselinesEnv

import numpy as np
import tensorflow as tf

from customPolicies.utils import linear_schedule, AutoSave
from customPolicies.customCnnPolicy import *

from stable_baselines import PPO2

In [None]:
# Settings forwarded to the DIAMBRA environment
diambraKwargs = {
    "romsPath": "../../roms/mame/",
    "binaryPath": "../../customMAME/",
    "frameRatio": 6,
    "render": False,
    "player": "Random",  # P1 / P2
}

# Game-dependent kwargs: difficulty level and the characters for both sides
if gameId == "doapp":
    diambraKwargs.update(difficulty=3, characters=["Kasumi", "Kasumi"])
elif gameId == "sfiii3n":
    diambraKwargs.update(difficulty=6, characters=["Ryu", "Ryu"])
elif gameId == "tektagt":
    # This game takes a pair of characters per side
    diambraKwargs.update(difficulty=6,
                         characters=[["Jin", "Yoshimitsu"], ["Jin", "Yoshimitsu"]])
else:
    # Any other id has no settings defined in this notebook
    raise Exception("Game not implemented: {}".format(gameId))

# Same outfit index for both sides, regardless of the game
diambraKwargs["charOutfits"] = [2, 2]

# Settings for the DIAMBRA gym layer
diambraGymKwargs = {
    "showFinal": False,
    "continueGame": 0.0,  # If < 0.0 means number of continues
    "actionSpace": "discrete",
    "attackButCombinations": False,
    "actBufLen": 12,
}

# Settings for the gym wrappers applied on top of the environment
wrapperKwargs = {
    "noOpMax": 0,
    "hwcObsResize": [128, 128, 1],
    "normalizeRewards": True,
    "clipRewards": False,
    "frameStack": 4,
    "dilation": 1,
    "scale": True,
    "scaleMod": 0,
}

# Additional observations, listed in the order they are requested.
# Size notes carried over from the original comments:
#   actionsBuf      -> env.actBufLen*(env.n_actions[0]+env.n_actions[1])
#   ownChar/oppChar -> len(env.charNames) each
#   everything else -> 1 each
keyToAdd = ["actionsBuf"]

if gameId == "tektagt":
    # Two health values per side, plus which character is active on each side
    keyToAdd += ["ownHealth1", "ownHealth2",
                 "oppHealth1", "oppHealth2",
                 "ownActiveChar", "oppActiveChar"]
else:
    # A single health value per side
    keyToAdd += ["ownHealth", "oppHealth"]

keyToAdd += ["ownPosition", "oppPosition", "stage", "ownChar", "oppChar"]

# Number of environments requested from the factory for training
numEnv = 8

# The environment id is the game id with a "_Train" suffix
envId = "{}_Train".format(gameId)
env = makeStableBaselinesEnv(
    envId, numEnv, timeDepSeed,
    diambraKwargs, diambraGymKwargs, wrapperKwargs,
    keyToAdd=keyToAdd, useSubprocess=True,
)

In [None]:
# Inspect the observation space: the space itself, its dtype and its bounds
obsSpace = env.observation_space
print("Obs_space = ", obsSpace)
print("Obs_space type = ", obsSpace.dtype)
print("Obs_space high = ", obsSpace.high)
print("Obs_space low = ", obsSpace.low)

In [None]:
# Inspect the action space: the space itself, its dtype and its size
actSpace = env.action_space
print("Act_space = ", actSpace)
print("Act_space type = ", actSpace.dtype)

# The size attribute is `nvec` for the multi-discrete setting, `n` otherwise
isMultiDiscrete = diambraGymKwargs["actionSpace"] == "multiDiscrete"
print("Act_space n = ", actSpace.nvec if isMultiDiscrete else actSpace.n)

In [None]:
# Policy parameters: read from the first environment's attributes
nActions = env.get_attr("nActions")[0]
actBufLen = env.get_attr("actBufLen")[0]

# Action-buffer contribution: buffer length times the sum of both action counts
actionsBufSize = actBufLen * (nActions[0] + nActions[1])

policyKwargs = {
    # Buffer size plus one entry per key in keyToAdd, minus 3 (original
    # note: "No Char Info")
    "n_add_info": actionsBufSize + len(keyToAdd) - 3,
    "layers": [64, 64],
    "cnn_extractor": local_nature_cnn_small,
}

print("n_add_info =", policyKwargs["n_add_info"])

In [None]:
# PPO hyper-parameters
setGamma = 0.94
# Alternative schedule kept for reference: linear_schedule(2.5e-4, 2.5e-6)
setLearningRate = linear_schedule(5.0e-5, 2.5e-6)
# Alternative schedule kept for reference: linear_schedule(0.15, 0.025)
setClipRange = linear_schedule(0.075, 0.025)
# The value-function clip range reuses the same schedule object
setClipRangeVf = setClipRange

# Initialize the model
ppoSettings = dict(
    verbose=1,
    gamma=setGamma,
    nminibatches=4,
    noptepochs=4,
    n_steps=128,
    learning_rate=setLearningRate,
    cliprange=setClipRange,
    cliprange_vf=setClipRangeVf,
    tensorboard_log=tensorBoardFolder,
    policy_kwargs=policyKwargs,
)
model = PPO2(CustCnnPolicy, env, **ppoSettings)

# OR: load a previously trained agent instead of creating a new one
#model = PPO2.load(os.path.join(modelFolder, "9M"), env=env, tensorboard_log=tensorBoardFolder,
#                  policy_kwargs=policyKwargs, gamma = setGamma, learning_rate=setLearningRate,
#                  cliprange=setClipRange, cliprange_vf=setClipRangeVf)

In [None]:
# Show the discount factor stored on the model
print("Model discount factor = ", model.gamma)

In [None]:
# Total number of timesteps requested from learn()
time_steps = 10000000

# Autosave callback: configured with a check frequency of 1,000,000, the
# number of environments, and a "9M_" path prefix inside the model folder
autoSaveCallback = AutoSave(
    check_freq=1000000,
    numEnv=numEnv,
    save_path=os.path.join(modelFolder, "9M_"),
)

# Train the agent
model.learn(total_timesteps=time_steps, callback=autoSaveCallback)

In [None]:
# Save the agent
# NOTE(review): "19M" looks like a 9M checkpoint plus the 10M steps trained
# here, but the active code builds a fresh PPO2 model (the 9M load is
# commented out) - confirm the intended file name.
model.save(os.path.join(modelFolder, "19M"))

## Evaluation

In [None]:
# Initialize a new evaluation environment with rendering switched on.
diambraKwargs["render"] = True

# Build it with the same factory used for training. The previous call used
# `make_diambra_env` / `diambraMame`, which this notebook never imports or
# defines, so the cell failed with a NameError.
# A single environment is requested: the evaluation loop plays one game at a time.
numEvalEnv = 1

envId = gameId + "_Test"
env = makeStableBaselinesEnv(envId, numEvalEnv, timeDepSeed, diambraKwargs, diambraGymKwargs,
                             wrapperKwargs, keyToAdd=keyToAdd, useSubprocess=True)

In [None]:
# Play `maxNumEp` evaluation episodes with deterministic actions and record
# the cumulative reward of each one.

cumulativeEpRew = 0.0      # reward accumulated in the episode in progress
cumulativeEpRewAll = []    # one entry per finished episode
cumulativeTotRew = 0.0     # total over all finished episodes (was used without being initialised)

maxNumEp = 10
currNumEp = 0

# try/finally makes sure the environment is closed even if an error
# interrupts the evaluation.
try:
    observation = env.reset()

    while currNumEp < maxNumEp:

        # predict() returns a pair; only the action is used here
        action, _states = model.predict(observation, deterministic=True)

        observation, reward, done, info = env.step(action)

        cumulativeEpRew += reward

        # An episode has ended as soon as any `done` flag is set
        if np.any(done):
            currNumEp += 1
            print("Ep. # = ", currNumEp)
            print("Ep. Cumulative Rew # = ", cumulativeEpRew)
            cumulativeEpRewAll.append(cumulativeEpRew)
            cumulativeTotRew += cumulativeEpRew
            cumulativeEpRew = 0.0
finally:
    env.close()

In [None]:
# Summary of the evaluation: per-episode cumulative rewards, then their
# mean and standard deviation.
print("Cumulative reward = ", cumulativeEpRewAll)    
print("Mean cumulative reward = ", np.mean(cumulativeEpRewAll))    
print("Std cumulative reward = ", np.std(cumulativeEpRewAll))  