# Base Environment Description

- No obstructions in the Unity environment, just the bipedal agent & reward box
- Thinking of adding randomly generated obstructions in later iterations (anything new needs to be added in the Unity design engine on the MacBook)
- The Python environment is included in the directory under "/.env"; use `conda activate /.env` to access it

In [2]:
# Load in Custom Env -- View Head Mode
import gym
import time

from mlagents_envs.environment import UnityEnvironment
from gym_unity.envs import UnityToGymWrapper

#"/Volumes/GoogleDrive/My Drive/Colab/IDS 576 - Deep Learning/Final Project/ml-agents/gym-unity/gym_unity/envs/Environment-1.app" #Env's home during project
#"/Users/ckg-files/UnityTest/CustomUnityEnvironments/Environment-1.app" #moved to git dir

# Location of the custom Unity build that gets wrapped as a `Gym` environment below
CUSTOM_ENV_PATH = "CustomUnityEnvironments/Environment-1.x86_64" #extension is ".app" on MacOS

# Notes on the custom Unity environments:
#  - BaseEnvironment.app has no obstructions -- may add some if there is time to teach the agent to walk around them
#  - the first env was `CKG-Walker.app` -- a reskin of the provided one (single agent; multiple agents are not possible with a custom model)

# Launch the Unity build; no_graphics=True runs it in headless mode
unity_env = UnityEnvironment(
    CUSTOM_ENV_PATH,
    no_graphics=True,
)

# Expose the Unity environment through the OpenAI Gym interface
env = UnityToGymWrapper(
    unity_env,
    uint8_visual=False,
    flatten_branched=False,
    allow_multiple_obs=False,
)

[INFO] Connected to Unity environment with package version 2.1.0-exp.1 and communication version 1.5.0
[INFO] Connected new brain: Walker?team=0




## Explore Env

- Check the input & outputs of Actions/States in the user created env 

In [3]:
# Get Info about Env
# NOTE: a notebook cell only echoes the value of its last expression, so of the
# expressions below only `env.name` is actually displayed.

env.action_size #39 degrees of freedom (joints over which agency is given to the agent)
env.action_space #Gym-like description of the action space (from the Unity Env)

env.observation_space #Input vector of shape (243,) -- 1D vector of length 243 (presumably each value describes the Walker's body position relative to the floor -- TODO confirm)

env.name #'Walker?team=0' -- Walker is the name of the Env!

'Walker?team=0'

In [4]:
# Get Environment Params
# Prints the shape/dtype of the action and observation spaces, then the full
# Gym space objects. (Bare expressions in the middle of a cell display nothing,
# so only explicit print calls are used here.)

# All 39 actions are continuous and bounded between -1 and 1 (lower & upper bounds)
print(f"Action Space Shape: {env.action_space.shape} - {env.action_space.dtype}")
# The observation space (length 243) is unbounded: any value from -inf to inf
print(f"Observation Space Shape: {env.observation_space.shape} - {env.observation_space.dtype}")
print("\n")

# Actual Action & Observation Spaces
print("Action Space:", env.action_space) #continuous Box in [-1, 1], one entry per action
print("Observation Space:", env.observation_space) #continuous, unbounded observations

Action Space Shape: (39,) - float32
Observation Space Shape: (243,) - float32


Action Space: Box([-1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1.], [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.], (39,), float32)
Observation Space: Box([-inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf
 -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf
 -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf
 -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf
 -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf
 -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf
 -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf
 -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf
 -inf -inf -i

# Neural Network Description

Chosen Model - Proximal Policy Optimization (PPO Continuous)

PPO is an on-policy RL model. In `on policy` learning, the function that approximates the optimal policy learns from its own actions as it takes them; an off-policy model would instead learn its policy function from actions chosen by a different behaviour (for example, a series of random actions), mapping the value derived from them.

<BR>

**References**

[OpenAI Post on PPO](https://openai.com/blog/openai-baselines-ppo/)

[On-Policy Vs. Off-Policy RL Models](https://stats.stackexchange.com/questions/184657/what-is-the-difference-between-off-policy-and-on-policy-learning) 

[Upload PyTorch Model into Unity](https://medium.com/@a.abelhopereira/how-to-use-pytorch-models-in-unity-aa1e964d3374)

[RL Overview - OpenAI](https://spinningup.openai.com/en/latest/spinningup/rl_intro2.html)

<BR>

**Model Architecture**

Input = [243, 1] -- What the network sees

Output = [39, 1] -- Possible actions, one continuous value per degree of freedom (each bounded between -1 and 1)

Reward = 

In [5]:
# SAMPLE PPO From minimalRL
import gym
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal
import numpy as np

from mlagents_envs.environment import UnityEnvironment
from gym_unity.envs import UnityToGymWrapper


# Hyperparameters -- original values from the minimalRL sample
learning_rate  = 3e-4  # step size handed to the Adam optimizer
gamma          = 0.9   # discount applied to the next-state value in the TD target
lmbda          = 0.9   # multiplied with gamma when accumulating the advantage
eps_clip       = 0.2   # probability ratio is clamped to [1 - eps_clip, 1 + eps_clip]
K_epoch        = 10    # passes over the batched data per training call
rollout_len    = 3     # transitions collected before a rollout is stored
buffer_size    = 30    # mini-batches built by each make_batch call
minibatch_size = 32    # rollouts popped into each mini-batch


# UDFs
class PPO(nn.Module):
    """Actor-critic network plus PPO-clip trainer for continuous actions.

    One shared hidden layer feeds three heads: the mean and the standard
    deviation of a diagonal Gaussian policy, and a scalar state-value
    estimate. Rollouts are buffered in ``self.data`` until ``train_net`` has
    enough of them to run an update.

    The training hyperparameters (``learning_rate``, ``gamma``, ``lmbda``,
    ``eps_clip``, ``K_epoch``, ``buffer_size``, ``minibatch_size``) are read
    from module-level globals.

    Args:
        obs_dim: Length of the flattened observation vector.
        act_dim: Number of continuous action components.
        hidden_dim: Width of the shared hidden layer.
    """

    # Floor added to the policy's standard deviation so that Normal() never
    # receives an exact zero when softplus underflows.
    _MIN_STD = 1e-5

    def __init__(self, obs_dim=243, act_dim=39, hidden_dim=128):
        super(PPO, self).__init__()
        self.data = []  # buffered rollouts; each rollout is a list of transitions

        # Layers
        self.fc1 = nn.Linear(obs_dim, hidden_dim)     # shared trunk over the observation
        self.fc_mu = nn.Linear(hidden_dim, act_dim)   # policy mean, one per action component
        self.fc_std = nn.Linear(hidden_dim, act_dim)  # policy std, one per action component
        self.fc_v = nn.Linear(hidden_dim, 1)          # the critic estimates ONE value per state

        # Optimizer
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
        self.optimization_step = 0

    def pi(self, x, softmax_dim=0):
        """Return the mean and standard deviation of the Gaussian policy.

        Args:
            x: Observation tensor whose last dimension is ``obs_dim``.
            softmax_dim: Unused; kept so existing callers keep working.

        Returns:
            Tuple ``(mu, std)``; both have ``act_dim`` as their last dimension.
            ``mu`` lies in [-1, 1] and ``std`` is strictly positive.
        """
        x = F.relu(self.fc1(x))
        # The environment's actions are bounded to [-1, 1], so the mean is
        # squashed with a plain tanh (no extra scaling).
        mu = torch.tanh(self.fc_mu(x))
        std = F.softplus(self.fc_std(x)) + self._MIN_STD
        return mu, std

    def v(self, x):
        """Return the state-value estimate; the last dimension has size 1."""
        x = F.relu(self.fc1(x))
        return self.fc_v(x)

    def put_data(self, transition):
        """Buffer one rollout.

        Args:
            transition: A rollout, i.e. a sequence of
                ``(s, a, r, s_prime, log_prob, done)`` tuples.
        """
        self.data.append(transition)

    @staticmethod
    def _as_float_tensor(value):
        """Convert a tensor, array, list or scalar to a detached float32 tensor."""
        if torch.is_tensor(value):
            # Detach so stored transitions never drag an autograd graph along.
            return value.detach().to(torch.float)
        return torch.as_tensor(np.asarray(value), dtype=torch.float)

    def _stack_rollout(self, rollout):
        """Stack one rollout into six tensors with a leading time dimension.

        Field order matches what ``calc_advantage`` unpacks:
        ``(s, a, r, s_prime, done_mask, old_log_prob)``.
        """
        columns = ([], [], [], [], [], [])
        for s, a, r, s_prime, prob_a, done in rollout:
            fields = (
                # Flatten so a leading singleton dimension on the observation
                # or action does not leak into the batch shape.
                self._as_float_tensor(s).reshape(-1),
                self._as_float_tensor(a).reshape(-1),
                self._as_float_tensor(r).reshape(1),
                self._as_float_tensor(s_prime).reshape(-1),
                torch.tensor([0.0 if done else 1.0]),  # 0 marks a terminal step
                self._as_float_tensor(prob_a).reshape(-1),
            )
            for column, field in zip(columns, fields):
                column.append(field)
        return tuple(torch.stack(column) for column in columns)

    def make_batch(self):
        """Drain the rollout buffer into ``buffer_size`` mini-batches.

        Each mini-batch consumes ``minibatch_size`` rollouts (popped from the
        end of ``self.data``) and keeps every transition of every rollout.

        Returns:
            List of tuples ``(s, a, r, s_prime, done_mask, old_log_prob)``.
            Each tensor is shaped (rollouts, steps, features); ``r`` and
            ``done_mask`` have a feature size of 1.

        Raises:
            IndexError: If fewer than ``buffer_size * minibatch_size`` rollouts
                are buffered.
        """
        data = []
        for _ in range(buffer_size):
            rollouts = [self._stack_rollout(self.data.pop()) for _ in range(minibatch_size)]
            # zip(*rollouts) regroups the per-rollout tuples field by field.
            mini_batch = tuple(torch.stack(field) for field in zip(*rollouts))
            data.append(mini_batch)

        return data

    def calc_advantage(self, data):
        """Append TD targets and GAE advantages to every mini-batch.

        Args:
            data: Output of ``make_batch``.

        Returns:
            List of tuples ``(s, a, r, s_prime, done_mask, old_log_prob,
            td_target, advantage)``; the two new tensors are shaped
            (rollouts, steps, 1).
        """
        data_with_adv = []
        for mini_batch in data:
            s, a, r, s_prime, done_mask, old_log_prob = mini_batch
            with torch.no_grad():
                td_target = r + gamma * self.v(s_prime) * done_mask
                delta = td_target - self.v(s)

                # Accumulate backwards along the time axis of each rollout.
                # done_mask zeroes the carry at terminal steps, so a rollout
                # that spans two episodes does not mix their advantages.
                advantage = torch.zeros_like(delta)
                running = torch.zeros_like(delta[:, 0])
                for t in reversed(range(delta.shape[1])):
                    running = delta[:, t] + gamma * lmbda * done_mask[:, t] * running
                    advantage[:, t] = running

            data_with_adv.append((s, a, r, s_prime, done_mask, old_log_prob, td_target, advantage))

        return data_with_adv

    def train_net(self):
        """Run ``K_epoch`` PPO-clip updates once enough rollouts are buffered.

        Does nothing until ``minibatch_size * buffer_size`` rollouts have been
        stored via ``put_data``. Increments ``self.optimization_step`` once per
        optimizer step.
        """
        if len(self.data) < minibatch_size * buffer_size:
            return

        data = self.calc_advantage(self.make_batch())

        for _ in range(K_epoch):
            for mini_batch in data:
                s, a, r, s_prime, done_mask, old_log_prob, td_target, advantage = mini_batch

                mu, std = self.pi(s, softmax_dim=1)
                dist = Normal(mu, std)
                # The policy is a diagonal Gaussian, so the log-probability of
                # the whole action vector is the sum over its components.
                log_prob = dist.log_prob(a).sum(dim=-1, keepdim=True)
                old_joint_log_prob = old_log_prob.sum(dim=-1, keepdim=True)
                ratio = torch.exp(log_prob - old_joint_log_prob)  # a/b == exp(log(a)-log(b))

                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1 - eps_clip, 1 + eps_clip) * advantage
                loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(self.v(s), td_target)

                self.optimizer.zero_grad()
                loss.mean().backward()
                nn.utils.clip_grad_norm_(self.parameters(), 1.0)
                self.optimizer.step()
                self.optimization_step += 1


In [7]:
# Close Env if crashed
# Run this cell by hand after a failed run so the environment currently bound to
# `env` is closed before another one is created.
env.close()

In [8]:
# Run Above Model
# Launches the Unity environment with graphics, collects rollouts with the
# current policy, and lets the model train whenever its buffer is full.
unity_env = UnityEnvironment(CUSTOM_ENV_PATH,
                             no_graphics=False) #no_graphics=True to run in headless mode
env = UnityToGymWrapper(unity_env, uint8_visual=False, flatten_branched=False, allow_multiple_obs=True) #originally passed False to allow_multiple_obs
model = PPO()
score = 0.0
print_interval = 20
mission_failed = 0 #keep track of number of failures
rollout = []
training_start_time = time.time()

# try/finally guarantees the environment is closed even if an episode raises,
# so a crash does not leave it open for the next run.
try:
    for n_epoch in range(1000):
        s = env.reset()
        done = False
        while not done:
            for t in range(rollout_len):
                s = np.array(s) #`s` is returned as a list, not an np array

                # Acting needs no gradients; without no_grad every stored
                # log_prob would keep its autograd graph alive in the buffer.
                with torch.no_grad():
                    mu, std = model.pi(torch.from_numpy(s).float())
                    dist = Normal(mu, std)
                    a = dist.sample()
                    log_prob = dist.log_prob(a)

                s_prime, r, done, info = env.step(a) #`a` is a vector of actions, not a scalar
                s_prime = np.array(s_prime)

                rollout.append((s, a, r/10.0, s_prime, log_prob, done))
                if len(rollout) == rollout_len:
                    model.put_data(rollout)
                    rollout = []

                s = s_prime
                score += r
                if done:
                    mission_failed += 1 #we'll get them next time!
                    break

            # Train (train_net is a no-op until enough rollouts are buffered)
            model.train_net()

        if n_epoch%print_interval==0 and n_epoch!=0:
            # Calc Time Elapsed
            time_elapsed = time.time() - training_start_time
            elapsed_time = 'Training Time: {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60)

            # Print out Summary Training Information per Print Interval
            print(f"Epochs: {n_epoch} | Average Score: {score/print_interval:.2f} | Optimization Steps: {model.optimization_step} | Missions Failed: {mission_failed} | {elapsed_time}")
            score = 0.0 #reset score
finally:
    env.close()


UnityEnvironmentException: Couldn't launch the /Users/ckg-files/AgentWalker/CustomUnityEnvironments/Environment-1.app environment. Provided filename does not match any environments.