# Configuration

### Environment Configuration

In [12]:
# Observation settings shared by the training and the evaluation environment.
ENV_USE_FRAME_STACK = False             # forwarded to the environments as `frame_stack`
ENV_FRAME_STACK_COUNT = 4               # forwarded to the environments as `frames_int`
ENV_USE_GRAYSCALE_OBSERVATION = False   # forwarded to the environments as `use_grayscale`

### Model Configuration

In [13]:
# Model settings forwarded to the training call as `use_cnn` and `device`.
MODEL_USE_CNN = True
MODEL_DEVICE = "auto"   # one of "auto", "cuda", "cpu" or "mps"

### Training Configuration

In [14]:
# --- Run identity and evaluation schedule ---
TRAINING_NAME = "baseline-CNN-MPS-5"                # Run name; also the checkpoint sub-directory the model is loaded from
TRAINING_EVAL_FREQUENCY = 5_000                     # How often the agent is evaluated (forwarded as `eval_freq`)
TRAINING_EVAL_TIMES = 5                             # Number of simulations performed per evaluation

# --- Training length ---
TRAINING_TOTAL_TIMESTEPS = 300_000                  # Total number of timesteps before training is complete

# --- Optimiser ---
TRAINING_LEARNING_RATE = 1e-5                       # Learning rate
TRAINING_LEARNING_STARTS_AT = 10_000                # Timesteps to perform before learning begins

# --- Replay buffer and update cadence ---
TRAINING_REPLAY_BUFFER_SIZE = 100_000               # Number of replays kept in the replay buffer (forwarded as `buffer_size`)
TRAINING_BATCH_SIZE = 32                            # Number of replays picked for each gradient step
TRAINING_TARGET_UPDATE_INTERVAL = 1                 # How often the target network is updated from the online network
TRAINING_GRADIENT_STEPS = 1                         # Number of gradient steps performed per training update

# --- Prioritized replay ---
TRAINING_USE_PRIORITIZED_REPLAY = False             # Whether or not to use prioritized replay
TRAINING_PRIORITIZED_REPLAY_EPS = 1e-5              # Epsilon of prioritized replay
TRAINING_PRIORITIZED_REPLAY_BETA_START = 1.0        # Initial beta; beta controls the importance of prioritized replays
TRAINING_PRIORITIZED_REPLAY_BETA_END = 0.1          # Final beta value
TRAINING_PRIORITIZED_REPLAY_BETA_FRACTION = 0.55    # Fraction of total training time over which beta goes from start to end

# --- Exploration (epsilon controls the exploration rate) ---
TRAINING_EPS_START = 0.9                            # Initial epsilon
TRAINING_EPS_END = 0.0095                           # Final epsilon
TRAINING_EPS_FRACTION = 0.5                         # Fraction of total training time over which epsilon decreases

# --- Static coefficients ---
TRAINING_GAMMA = 0.99                               # Gamma: how much far-future rewards count (1.0) versus only near-future ones (0.0)
TRAINING_TAU = 0.96                                 # Tau: how much of the online network is copied to the target network at each copy

# Set up MineDojo Environment

In [15]:
from Environments import SkyRunner, MultithreadGym

# Training environment: the multithreaded wrapper that is later passed to
# `train_openAI.train` as `env`, configured from the ENV_* constants.
env = MultithreadGym.MultithreadGym(
    thread_int=1,
    env_int=1,
    frame_stack=ENV_USE_FRAME_STACK,
    frames_int=ENV_FRAME_STACK_COUNT,
    use_grayscale=ENV_USE_GRAYSCALE_OBSERVATION,
)

starting Reloader 0
stopping Reloader 0


# Load EVAL-Environment

In [None]:
# Separate environment handed to the training call as `eval_env`; it is built
# from the same ENV_* observation settings as the training environment.
eval_env = SkyRunner.CustomEnv(
    frame_stack=ENV_USE_FRAME_STACK,
    frames_int=ENV_FRAME_STACK_COUNT,
    use_grayscale=ENV_USE_GRAYSCALE_OBSERVATION,
)
eval_env.reset()

# Begin training

In [None]:
import train_openAI
import importlib

# Reload the training module before every run so the call below uses its
# current source rather than the copy cached by the first import.
importlib.reload(train_openAI)

train_openAI.train(
    # Environments and run identity
    env=env,
    eval_env=eval_env,
    name=TRAINING_NAME,
    # Evaluation schedule
    eval_freq=TRAINING_EVAL_FREQUENCY,
    n_eval_episodes=TRAINING_EVAL_TIMES,
    # Training length and optimiser
    total_timesteps=TRAINING_TOTAL_TIMESTEPS,
    learning_rate=TRAINING_LEARNING_RATE,
    learning_starts=TRAINING_LEARNING_STARTS_AT,
    # Replay buffer and update cadence
    buffer_size=TRAINING_REPLAY_BUFFER_SIZE,
    batch_size=TRAINING_BATCH_SIZE,
    target_update_interval=TRAINING_TARGET_UPDATE_INTERVAL,
    gradient_steps=TRAINING_GRADIENT_STEPS,
    # Prioritized replay
    use_prioritized_replay=TRAINING_USE_PRIORITIZED_REPLAY,
    prioritized_replay_eps=TRAINING_PRIORITIZED_REPLAY_EPS,
    prioritized_replay_initial_beta=TRAINING_PRIORITIZED_REPLAY_BETA_START,
    prioritized_replay_final_beta=TRAINING_PRIORITIZED_REPLAY_BETA_END,
    prioritized_replay_beta_fraction=TRAINING_PRIORITIZED_REPLAY_BETA_FRACTION,
    # Exploration schedule
    exploration_initial_eps=TRAINING_EPS_START,
    exploration_final_eps=TRAINING_EPS_END,
    exploration_fraction=TRAINING_EPS_FRACTION,
    # Static coefficients
    gamma=TRAINING_GAMMA,
    tau=TRAINING_TAU,
    # Model
    use_cnn=MODEL_USE_CNN,
    device=MODEL_DEVICE,
)

# Load existing model and replay it

In [None]:
from stable_baselines3 import DQN
from CustomBaselines3.DoubleDQN import DoubleDQN

# Load the checkpoint stored at ./dDQN-checkpoints/<TRAINING_NAME>/final_model.zip.
model = DoubleDQN.load("./dDQN-checkpoints/" + TRAINING_NAME + "/final_model.zip")

# Replay the loaded model in `env`, episode after episode, rendering every step
# and printing the accumulated reward whenever an episode finishes.
obs = env.reset()
acc_r = 0
try:
    while True:
        # `deterministic=True` is passed so the model does not sample its action.
        act, st = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(act)

        acc_r += reward

        env.render()

        if done:
            obs = env.reset()
            # NOTE: "%d" drops any fractional part of the accumulated reward.
            print("Finished with reward %d" % acc_r)
            acc_r = 0
except KeyboardInterrupt:
    # The loop has no exit condition, so interrupting the kernel is the only
    # way to stop it. Catching the interrupt lets this cell end without an
    # error, so the shutdown cell can still run afterwards.
    print("Playback interrupted")

# Shutdown Environments

In [None]:
# Close both environments. The `finally` clause guarantees that the evaluation
# environment is closed even if closing the training environment raises.
try:
    env.close()
finally:
    eval_env.close()