In [None]:
# Game selector. Other ids referenced by this notebook: "sfiii3n", "tektagt"
# (edit the assignment below to switch game).
gameId = "doapp"

In [None]:
from collections import namedtuple
# Algorithm options
#
# NOTE: `algoOpt` is the namedtuple *class* itself, used as a plain mutable
# namespace: the option values below are assigned as class attributes and no
# instance is ever created. The rest of the notebook only reads `algoOpt.<name>`.
algoOpt = namedtuple('algoOptions', 'character frameSize frameStack actionSpace attackButComb '
                                    'frameRatio actionBufLen dilation smallCnn')

# Character selection depends on the game: a single name for "doapp" and
# "sfiii3n", a list of two names for "tektagt".
if gameId == "doapp":
    algoOpt.character = "Random"
elif gameId == "sfiii3n":
    algoOpt.character = "Random"
elif gameId == "tektagt":
    algoOpt.character = ["Random", "Random"]
else:
    # Without this guard `algoOpt.character` would remain the namedtuple field
    # descriptor and only fail later, with an unrelated-looking TypeError, when
    # the option id string is built from it.
    raise ValueError("character not provided for selected gameId = {}".format(gameId))

# Game-independent options
algoOpt.frameRatio    = 6
algoOpt.frameSize     = 128
algoOpt.frameStack    = 4
algoOpt.dilation      = 1
algoOpt.actionSpace   = "discrete"
algoOpt.attackButComb = True
algoOpt.actionBufLen  = 12
algoOpt.smallCnn      = True

In [None]:
import sys, os, time
# Seed derived from the current wall-clock time (scaled to milliseconds)
timeDepSeed = int((time.time()-int(time.time()-0.5))*1000)

# Make the parent folder and the gym folder importable (paths are resolved
# against the current working directory)
sys.path.extend(os.path.join(os.path.abspath(''), relativeDir)
                for relativeDir in ('../', '../../gym/'))

# Identifier encoding the algorithm options; three groups joined by "_",
# values inside a group joined by "x"
specificAlgoOptId = "_".join([
    "".join(algoOpt.character) + "x" + str(algoOpt.frameRatio),
    "x".join([str(algoOpt.frameSize), str(algoOpt.frameStack),
              str(algoOpt.dilation), str(algoOpt.smallCnn)]),
    "x".join([algoOpt.actionSpace, str(algoOpt.attackButComb),
              str(algoOpt.actionBufLen)]),
])

# Output locations, one pair per (game, options) combination
tensorBoardFolder = "./{}_ppo2_selfPlay_TB_{}/".format(gameId, specificAlgoOptId)
modelFolder = "./{}_ppo2_selfPlay_Model_{}".format(gameId, specificAlgoOptId)

# Only the model folder is created here
os.makedirs(modelFolder, exist_ok=True)

In [None]:
from makeStableBaselinesEnv import makeStableBaselinesEnv

import tensorflow as tf

from sbUtils import linear_schedule, AutoSave, UpdateRLPolicyWeights
from customPolicies.customCnnPolicy import CustCnnPolicy, local_nature_cnn_small
from policies import noActionPolicy, randomPolicy, RLPolicy

from stable_baselines import PPO2

In [None]:
# Diambra environment kwargs
diambraKwargs = {
    "romsPath": "../../roms/mame/",
    "binaryPath": "../../customMAME/",
    "frameRatio": algoOpt.frameRatio,
    "render": True,
    # 2P game
    "player": "P1P2",
    # Game dependent kwargs: the same character / outfit setting for both players
    "characters": [algoOpt.character, algoOpt.character],
    "charOutfits": [2, 2],
}

# DIAMBRA gym kwargs (one entry per player)
diambraGymKwargs = {
    "actionSpace": [algoOpt.actionSpace, algoOpt.actionSpace],
    "attackButCombinations": [algoOpt.attackButComb, algoOpt.attackButComb],
}

# Gym Wrappers kwargs
wrapperKwargs = {
    "noOpMax": 0,
    "hwcObsResize": [algoOpt.frameSize, algoOpt.frameSize, 1],
    "normalizeRewards": True,
    "clipRewards": False,
    "frameStack": algoOpt.frameStack,
    "dilation": algoOpt.dilation,
    "scale": True,
    "scaleMod": 0,
}

# Additional observations
# Size notes carried over from the original per-key annotations:
#   actionsBuf                      -> env.actBufLen*(env.n_actions[0]+env.n_actions[1])
#   health / active char / position -> 1 each
#   ownChar, oppChar                -> len(env.charNames) each
keyToAdd = (
    ["actionsBuf"]
    # Health keys differ for "tektagt": two health values per side plus the
    # active-character keys
    + (["ownHealth", "oppHealth"]
       if gameId != "tektagt"
       else ["ownHealth1", "ownHealth2",
             "oppHealth1", "oppHealth2",
             "ownActiveChar", "oppActiveChar"])
    + ["ownPosition", "oppPosition", "ownChar", "oppChar"]
)

In [None]:
# Action counts are only known here for "doapp"; any other game is rejected
if gameId != "doapp":
    raise ValueError("nActions not provided for selected gameId = {}".format(gameId))

# Second entry depends on whether attack button combinations are enabled
nActions = [9, 8] if algoOpt.attackButComb else [9, 4]

In [None]:
# Baseline policies, both built for the discrete action space
noAction_policy = noActionPolicy(
    actionSpace="discrete",
)
random_policy = randomPolicy(
    nActions,
    actionSpace="discrete",
)

In [None]:
# Load the "_0M" checkpoint; its file name is the "_"-joined list of
# additional observation keys followed by "_0M"
model = PPO2.load(os.path.join(modelFolder, "_".join(keyToAdd) + "_0M"))

# Wrap the loaded model as a policy object (passed as `p2Policy` when the
# environment is created)
deterministicFlag = False
rl_policy = RLPolicy(
    model,
    deterministicFlag,
    nActions,
    name="PPO-0M",
    actionSpace=diambraGymKwargs["actionSpace"],
)

In [None]:
# Number of environments requested from the factory
numEnv = 1

# Environment id is the game id with a "_Train" suffix
envId = gameId + "_Train"

# Self-play setup: player 2 is driven by `rl_policy`; subprocesses are disabled
env = makeStableBaselinesEnv(
    envId,
    numEnv,
    timeDepSeed,
    diambraKwargs,
    diambraGymKwargs,
    wrapperKwargs,
    keyToAdd=keyToAdd,
    p2Mode="selfPlayVsRL",
    p2Policy=rl_policy,
    useSubprocess=False,
)

In [None]:
# Report the observation space together with its dtype and bounds
obsSpace = env.observation_space
print("Obs_space = ", obsSpace)
print("Obs_space type = ", obsSpace.dtype)
print("Obs_space high = ", obsSpace.high)
print("Obs_space low = ", obsSpace.low)

In [None]:
# Report the action space and its dtype
print("Act_space = ", env.action_space)
print("Act_space type = ", env.action_space.dtype)

# The size attribute is `nvec` for multiDiscrete and `n` otherwise
print("Act_space n = ",
      env.action_space.nvec
      if diambraGymKwargs["actionSpace"][0] == "multiDiscrete"
      else env.action_space.n)

In [None]:
# Policy param
# Cross-check the hand-written action counts against the environment's values
if nActions != env.get_attr("nActions")[0][0]:
    raise ValueError("nActions values wrong")

# From here on use the values reported by the (first) environment
nActions = env.get_attr("nActions")[0][0]
actBufLen = env.get_attr("actBufLen")[0]
nChar = env.get_attr("numberOfCharacters")[0]

# n_add_info = action buffer entries + remaining keys + two character terms.
# NOTE(review): the "-3" presumably removes actionsBuf / ownChar / oppChar from
# the key count because they are accounted for separately - confirm.
policyKwargs = {
    "n_add_info": (actBufLen*(nActions[0]+nActions[1])
                   + len(keyToAdd)-3
                   + 2*nChar),
    "layers": [64, 64],
}

# Optional CNN feature extractor override
if algoOpt.smallCnn:
    policyKwargs["cnn_extractor"] = local_nature_cnn_small

print("nActions =", nActions)
print("nChar =", nChar)
print("n_add_info =", policyKwargs["n_add_info"])

In [None]:
# PPO param
setGamma = 0.94
setLearningRate = linear_schedule(2.5e-4, 2.5e-6)
setClipRange = linear_schedule(0.15, 0.025)
# The value-function clip range reuses the very same schedule object
setClipRangeVf = setClipRange

# Initialize the model
model = PPO2(
    CustCnnPolicy,
    env,
    verbose=1,
    gamma=setGamma,
    n_steps=128,
    nminibatches=4,
    noptepochs=4,
    learning_rate=setLearningRate,
    cliprange=setClipRange,
    cliprange_vf=setClipRangeVf,
    tensorboard_log=tensorBoardFolder,
    policy_kwargs=policyKwargs,
)

# OR

# Load the trained agent (alternative to the fresh initialization above)
#model = PPO2.load(os.path.join(modelFolder, str("_".join(keyToAdd))+"_0M"), env=env, tensorboard_log=tensorBoardFolder,
#                  policy_kwargs=policyKwargs, gamma = setGamma, learning_rate=setLearningRate,
#                  cliprange=setClipRange, cliprange_vf=setClipRangeVf)

In [None]:
# Sanity check: show the discount factor stored on the model object
print("Model discount factor = ", model.gamma)

In [None]:
# Create the callback: autosave every USER DEF steps
autoSaveCallback = AutoSave(
    check_freq=1000000,
    numEnv=numEnv,
    save_path=os.path.join(modelFolder, "_".join(keyToAdd) + "_0M_"),
)

# Callback for the RL policy weights; it is also given a list of previous
# agents (here only the "_0M" checkpoint) and a sampling probability
prevAgentsSamplingDict = {
    "probability": 0.3,
    "list": [os.path.join(modelFolder, "_".join(keyToAdd) + "_0M")],
}
upRLPolWCallback = UpdateRLPolicyWeights(
    check_freq=200000,
    numEnv=numEnv,
    save_path=modelFolder,
    prevAgentsSampling=prevAgentsSamplingDict,
)

# Train the agent
timeSteps = 10000000
model.learn(
    total_timesteps=timeSteps,
    callback=[autoSaveCallback, upRLPolWCallback],
)

In [None]:
# Save the agent under the "_0M" name (the same path the notebook loads from earlier)
model.save(
    os.path.join(modelFolder, "_".join(keyToAdd) + "_0M")
)