In [None]:
# Identifier of the game to train on. It is embedded in the TensorBoard/model
# folder names and selects which health keys are added to the observation
# (the "tektagt" game uses a different key set).
gameId = "doapp"

In [None]:
from collections import namedtuple
# Algorithm options
# The record type is instantiated (rather than having attributes assigned onto
# the namedtuple class itself) so that `algoOpt` is a real, immutable tuple:
# a misspelled or missing field fails immediately instead of silently adding
# a stray class attribute.
AlgoOptions = namedtuple('algoOptions', 'dataFolder totalCpus smallCnn')

algoOpt = AlgoOptions(
    # Folder holding the recorded trajectories, relative to the working
    # directory. To use another dataset, replace the active value with one of
    # the commented alternatives.
    dataFolder="../../test/data/Discrete/Normal",
    #dataFolder="../../test/data/Discrete/HC",
    #dataFolder="../../test/data/MultiDiscrete/Normal",
    #dataFolder="../../test/data/MultiDiscrete/HC",
    # Forwarded to the imitation-learning environment as "totalCpus".
    totalCpus=2,
    # When True, the policy uses the `local_nature_cnn_small` extractor.
    smallCnn=True,
)

In [None]:
import sys, os, time, cv2
from os import listdir
import numpy as np

# Seed derived from the sub-second part of the wall clock, so it changes from
# one run to the next.
timeDepSeed = int((time.time()-int(time.time()-0.5))*1000)

# Make the parent directory importable.
# NOTE(review): presumably this is where the project helper modules imported
# later in the notebook live - confirm.
sys.path.append(os.path.join(os.path.abspath(''), '../.'))

# Both folder templates contain two placeholders, but only `gameId` was being
# supplied, which raises IndexError. The second placeholder is filled with a
# tag built from the last two components of the data folder
# (e.g. "Discrete_Normal"), so each dataset variant gets its own output
# folders and the names stay stable across runs.
# NOTE(review): the intended second value is not shown anywhere in the
# notebook - confirm this tag is what was meant.
datasetTag = "_".join(os.path.normpath(algoOpt.dataFolder).split(os.sep)[-2:])

tensorBoardFolder = "./{}_ppo2_imitationLearning_TB_{}/".format(gameId, datasetTag)
modelFolder = "./{}_ppo2_imitationLearning_Model_{}".format(gameId, datasetTag)

# Create the model folder up front; an already existing folder is fine.
os.makedirs(modelFolder, exist_ok=True)

In [None]:
# List the trajectory files found in the data folder
# Resolve the data folder against the current working directory.
base_path = os.path.abspath('')
trajRecFolder = os.path.join(base_path, algoOpt.dataFolder)

# Build the full path of every directory entry and keep regular files only.
trajectoriesFiles = list(filter(
    os.path.isfile,
    (os.path.join(trajRecFolder, name) for name in listdir(trajRecFolder)),
))
print(trajectoriesFiles)

In [None]:
from makeStableBaselinesILEnv import makeStableBaselinesILEnv

import tensorflow as tf

from sbUtils import linear_schedule, AutoSave
from customPolicies.customCnnPolicy import CustCnnPolicy, local_nature_cnn_small

from stable_baselines import PPO2

In [None]:
# Keyword arguments for the Diambra imitation-learning environment
diambraILKwargs = {
    "trajFilesList": trajectoriesFiles,
    "totalCpus": algoOpt.totalCpus,
}

# The hard-core flag is True when the trajectory folder path ends with "HC"
hardCore = trajRecFolder.endswith("HC")

# Additional observation keys. The trailing comments give the number of
# values each key contributes.
keyToAdd = [
    "actionsBuf",  # env.actBufLen*(env.n_actions[0]+env.n_actions[1])
]

if gameId == "tektagt":
    # This game tracks two health values per side plus the active character
    keyToAdd += [
        "ownHealth1",     # 1
        "ownHealth2",     # 1
        "oppHealth1",     # 1
        "oppHealth2",     # 1
        "ownActiveChar",  # 1
        "oppActiveChar",  # 1
    ]
else:
    keyToAdd += [
        "ownHealth",  # 1
        "oppHealth",  # 1
    ]

keyToAdd += [
    "ownPosition",  # 1
    "oppPosition",  # 1
    "stage",        # 1
    "ownChar",      # len(env.charNames)
    "oppChar",      # len(env.charNames)
]

In [None]:
# Build the imitation-learning environment from the recorded trajectories.
# It receives the environment kwargs, the time-dependent seed, the hard-core
# flag inferred from the data folder name and the extra observation keys.
# NOTE(review): `useSubprocess=True` presumably runs each environment in its
# own process - confirm against makeStableBaselinesILEnv.
envId = "ILTrain"
env = makeStableBaselinesILEnv(envId, diambraILKwargs, timeDepSeed, 
                               hardCore=hardCore, keyToAdd=keyToAdd, 
                               useSubprocess=True)

In [None]:
# Report the observation space together with its dtype and bounds.
obsSpace = env.observation_space
print("Obs_space = ", obsSpace)
print("Obs_space type = ", obsSpace.dtype)
print("Obs_space high = ", obsSpace.high)
print("Obs_space low = ", obsSpace.low)

In [None]:
# Report the action space together with its dtype and size.
print("Act_space = ", env.action_space)
print("Act_space type = ", env.action_space.dtype)
# The space kind is detected from the space object itself: the previous check
# read a `diambraGymKwargs` dict that this notebook never defines, which
# raised NameError. Multi-discrete spaces expose `nvec` (one size per
# sub-action), while plain discrete spaces expose a single `n`.
if hasattr(env.action_space, "nvec"):
    print("Act_space n = ", env.action_space.nvec)
else:
    print("Act_space n = ", env.action_space.n)

In [None]:
# Read the environment attributes needed to size the policy input, taking the
# first entry of what `get_attr` returns.
nActions = env.get_attr("nActions")[0]
actBufLen = env.get_attr("actBufLen")[0]
nChars = env.get_attr("nChars")[0]

# Number of additional (non-image) input values, following the per-key sizes
# noted where `keyToAdd` is built:
#   - "actionsBuf" contributes actBufLen*(nActions[0]+nActions[1]) values,
#   - "ownChar" and "oppChar" contribute nChars values each,
#   - every remaining key contributes one value, hence len(keyToAdd)-3.
# The character count is read through `nChars`; the previous code referenced
# an undefined `nChar` and raised NameError.
policyKwargs={}
policyKwargs["n_add_info"] = actBufLen*(nActions[0]+nActions[1]) + len(keyToAdd)-3 + 2*nChars
policyKwargs["layers"] = [64, 64]

# Optionally swap in the small CNN feature extractor.
if algoOpt.smallCnn:
    policyKwargs["cnn_extractor"] = local_nature_cnn_small

print("nActions =", nActions)
print("nChar =", nChars)
print("n_add_info =", policyKwargs["n_add_info"])

In [None]:
# PPO hyper-parameters
setGamma = 0.94
setLearningRate = linear_schedule(2.5e-4, 2.5e-6)
setClipRange = linear_schedule(0.15, 0.025)
# The value-function clip range reuses the very same schedule object
setClipRangeVf = setClipRange

# Create a fresh PPO2 model using the custom CNN policy
model = PPO2(
    CustCnnPolicy,
    env,
    gamma=setGamma,
    n_steps=128,
    nminibatches=4,
    noptepochs=4,
    learning_rate=setLearningRate,
    cliprange=setClipRange,
    cliprange_vf=setClipRangeVf,
    policy_kwargs=policyKwargs,
    tensorboard_log=tensorBoardFolder,
    verbose=1,
)

# Alternative: instead of creating a new model, resume from a saved agent

#model = PPO2.load(os.path.join(modelFolder, str("_".join(keyToAdd))+"_0M"), env=env, tensorboard_log=tensorBoardFolder, 
#                  policy_kwargs=policyKwargs, gamma = setGamma, learning_rate=setLearningRate, 
#                  cliprange=setClipRange, cliprange_vf=setClipRangeVf)

In [None]:
# Sanity check: show the discount factor actually stored on the model.
print("Model discount factor = ", model.gamma)

In [None]:
# Create the callback: autosave every USER DEF steps
# The number of parallel environments is taken from the vectorized
# environment; `numEnv` was previously used without ever being defined, which
# raised NameError.
numEnv = env.num_envs
autoSaveCallback = AutoSave(check_freq=1000000, numEnv=numEnv,
                            save_path=os.path.join(modelFolder, "_".join(keyToAdd)+"_0M"))

# Train the agent
timeSteps = 10000000
model.learn(total_timesteps=timeSteps, callback=autoSaveCallback)

In [None]:
# Persist the trained agent; the file name is the observation keys joined by
# "_" followed by the "_0M" suffix.
model.save(os.path.join(modelFolder, "_".join(keyToAdd) + "_0M"))