# Acrobot

## A2C Agent 

In [None]:
import gym
from a2c import A2CAgent 
import time
import numpy as np

# Train an A2C agent on Acrobot and collect its learning curve
# (reward and elapsed wall-clock time per episode).

# Create Gym environment
a2c_env = "Acrobot-v1"
env = gym.make(a2c_env)

# Check agent class for initialization parameters and initialize agent.
# Hyperparameters are only defined for the environments listed here.
if a2c_env == "Acrobot-v1":
    gamma = 0.95
    lr = 7e-4

agent = A2CAgent(env, gamma, lr)

# Define training parameters
max_episodes = 300
max_steps = 500

episode_rewards = []  # summed reward of each episode
run_time = []  # seconds elapsed since start_time, one entry per episode
start_time = time.time()
for episode in range(max_episodes):
    trajectory = []
    state = env.reset()
    episode_reward = 0
    for step in range(max_steps):
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        trajectory.append((state, action, reward, next_state, done))
        episode_reward += reward
        if done:
            break
        state = next_state

    # Record the episode after the step loop instead of inside it.
    # range(max_steps) never yields max_steps, so a "step == max_steps" check
    # cannot fire; an episode that ran out of steps without `done` would then
    # go unrecorded and episode_rewards would end up shorter than run_time.
    episode_rewards.append(episode_reward)
    print("Episode " + str(episode) + ": " + str(episode_reward))

    agent.update(trajectory, 0)
    elapse = time.time() - start_time
    run_time.append(elapse)

a2c_rewards = episode_rewards
a2c_runtime = run_time

In [None]:
# Save the A2C learning curve as CSV: column 0 is the elapsed time recorded
# per episode, column 1 is the episode reward. The timestamp in the file name
# keeps separate runs in separate files.
import os

name = './log_files/a2c/' + a2c_env + '-' + str(time.time()) + '.csv'
# Create the log directory on first use so a finished training run is not
# lost to a FileNotFoundError.
os.makedirs(os.path.dirname(name), exist_ok=True)
out = np.column_stack((a2c_runtime, a2c_rewards))
with open(name, 'ab') as f:
    np.savetxt(f, out, delimiter=',')

## DRPO Agent (KL) 

In [10]:
import gym
from drpo import DRTRPOAgent 
import time
import numpy as np

# Train a DRTRPOAgent on Acrobot with the KL policy loss and collect its
# learning curve (average reward and elapsed wall-clock time per episode).

# Create Gym environment
kl_env = "Acrobot-v1"
env = gym.make(kl_env)

# Check agent class for initialization parameters and initialize agent.
#
# A large learning rate can overflow the policy network and produce NaNs.
# If that happens, reduce lr or increase beta.
if kl_env == "Acrobot-v1":
    gamma = 0.95
    lr = 1e-2
    beta = 0.8

agent = DRTRPOAgent(env, gamma, lr)

# Define training parameters
max_episodes = 150
max_steps = 500

episode_rewards = []
run_time = []
start_time = time.time()
for episode in range(max_episodes):
    # The first episode starts from a fresh reset; later episodes start from
    # whatever `state` the previous episode's last rollout left behind.
    first_state = env.reset() if episode == 0 else state
    state_adv = []
    total_value_loss = 0
    episode_reward = 0

    # One rollout per candidate first action: action i is forced at step 0
    # and the agent chooses every action after that.
    for i in range(env.action_space.n):
        # NOTE(review): the env is reset here while `state` is set to
        # first_state, so the two may disagree -- confirm this is intended.
        env.reset()
        state = first_state
        trajectory = []

        for step in range(max_steps):
            action = i if step == 0 else agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            trajectory.append((state, action, reward, next_state, done))
            episode_reward += reward
            # The range bound already caps the rollout at max_steps.
            if done:
                break
            state = next_state

        adv, value_loss = agent.compute_adv_mc(trajectory)
        state_adv.append(adv[0])
        total_value_loss += value_loss

    # Rewards were summed over all rollouts; report the per-rollout mean.
    avg_episode_reward = episode_reward / env.action_space.n

    # add randomness for better exploration
    beta += np.random.random() * 0.1
    if (
        (state_adv[0] == state_adv[1])
        and (state_adv[1] == state_adv[2])
        and avg_episode_reward <= -490
    ):
        # All three advantages tie on a poor episode: perturb each one by an
        # independent draw from [-1, 1).
        for idx in range(3):
            state_adv[idx] += (np.random.random() - 0.5) * 2

    state_adv[1] += 5

    # restart the agent if stuck
    if episode >= 5 and avg_episode_reward <= -490:
        agent = DRTRPOAgent(env, gamma, lr)

    policy_loss = agent.compute_policy_loss_kl(first_state, state_adv, beta)
    # NOTE(review): only the last rollout's value_loss is passed on;
    # total_value_loss is accumulated above but not used in this cell --
    # confirm which one update() should receive.
    agent.update(value_loss, policy_loss)
    elapse = time.time() - start_time
    run_time.append(elapse)

    episode_rewards.append(avg_episode_reward)
    print("Episode " + str(episode) + ": " + str(avg_episode_reward))

dr_trpo_kl_rewards = episode_rewards
dr_trpo_kl_runtime = run_time

Episode 0: -500.0
Episode 1: -500.0
Episode 2: -500.0
Episode 3: -500.0
Episode 4: -500.0
Episode 5: -500.0
Episode 6: -500.0
Episode 7: -500.0
Episode 8: -500.0
Episode 9: -500.0
Episode 10: -500.0
Episode 11: -500.0
Episode 12: -500.0
Episode 13: -500.0
Episode 14: -500.0
Episode 15: -500.0
Episode 16: -500.0
Episode 17: -500.0
Episode 18: -500.0
Episode 19: -500.0
Episode 20: -462.6666666666667
Episode 21: -500.0
Episode 22: -500.0
Episode 23: -500.0
Episode 24: -500.0
Episode 25: -500.0
Episode 26: -500.0
Episode 27: -410.3333333333333
Episode 28: -500.0
Episode 29: -500.0
Episode 30: -500.0
Episode 31: -500.0
Episode 32: -202.33333333333334
Episode 33: -140.0
Episode 34: -150.0
Episode 35: -185.66666666666666
Episode 36: -170.0
Episode 37: -159.66666666666666
Episode 38: -217.66666666666666
Episode 39: -200.33333333333334
Episode 40: -188.0
Episode 41: -216.66666666666666
Episode 42: -176.33333333333334
Episode 43: -185.33333333333334
Episode 44: -198.66666666666666
Episode 45: -2

In [None]:
# Save the DRPO (KL) learning curve as CSV: column 0 is the elapsed time
# recorded per episode, column 1 is the average episode reward. The timestamp
# in the file name keeps separate runs in separate files.
import os

name = './log_files/dr_trpo_kl/' + kl_env + '-' + str(time.time()) + '.csv'
# Create the log directory on first use so a finished training run is not
# lost to a FileNotFoundError.
os.makedirs(os.path.dirname(name), exist_ok=True)
out = np.column_stack((dr_trpo_kl_runtime, dr_trpo_kl_rewards))
with open(name, 'ab') as f:
    np.savetxt(f, out, delimiter=',')

## DRPO Agent (Wasserstein)

In [5]:
import gym
from drpo import DRTRPOAgent 
import time
import numpy as np

# Train a DRTRPOAgent on Acrobot with the Wasserstein policy loss and collect
# its learning curve (average reward and elapsed wall-clock time per episode).

wass_env = "Acrobot-v1"
# Create Gym environment
env = gym.make(wass_env)

# Check agent class for initialization parameters and initialize agent
if wass_env == "Acrobot-v1":
    gamma = 0.95
    lr = 1e-2

agent = DRTRPOAgent(env, gamma, lr)

# Define training parameters
max_episodes = 150
max_steps = 500
total_adv_diff = 0  # running sum of each episode's largest advantage gap

episode_rewards = []
run_time = []
start_time = time.time()
for episode in range(max_episodes):
    # The first episode starts from a fresh reset; later episodes start from
    # whatever `state` the previous episode's last rollout left behind.
    if episode == 0:
        first_state = env.reset()
    else:
        first_state = state
    state_adv = []
    total_value_loss = 0

    episode_reward = 0
    # One rollout per candidate first action: action i is forced at step 0
    # and the agent chooses every action after that.
    for i in range(env.action_space.n):
        # NOTE(review): the env is reset here while `state` is set to
        # first_state, so the two may disagree -- confirm this is intended.
        env.reset()
        state = first_state
        action = i
        trajectory = []

        for step in range(max_steps):
            if step != 0:
                action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            trajectory.append((state, action, reward, next_state, done))
            episode_reward += reward
            # range(max_steps) already caps the rollout length, so only
            # `done` needs checking (step never equals max_steps).
            if done:
                break
            state = next_state

        adv, value_loss = agent.compute_adv_mc(trajectory)
        state_adv.append(adv[0])
        total_value_loss += value_loss

    # Rewards were summed over all rollouts; report the per-rollout mean.
    avg_episode_reward = episode_reward / env.action_space.n
    # add randomness for better exploration
    if (state_adv[0] == state_adv[1]) and (state_adv[1] == state_adv[2]) and avg_episode_reward < -490:
        state_adv[0] += (np.random.random() - 0.5) * 2
        state_adv[1] += (np.random.random() - 0.5) * 2
        state_adv[2] += (np.random.random() - 0.5) * 2

    state_adv[1] += 5

    # restart the agent if stuck
    if (episode >= 5) and (avg_episode_reward <= -490):
        agent = DRTRPOAgent(env, gamma, lr)

    total_adv_diff += max(
        abs(state_adv[1] - state_adv[0]),
        abs(state_adv[2] - state_adv[0]),
        abs(state_adv[2] - state_adv[1]),
    )
    # beta is the running mean of the per-episode advantage gap. The sum
    # already includes the current episode, so it holds episode + 1 terms;
    # dividing by `episode` would divide by zero on the first episode.
    beta = total_adv_diff / (episode + 1)
    policy_loss = agent.compute_policy_loss_wass(first_state, state_adv, beta)

    # NOTE(review): only the last rollout's value_loss is passed on;
    # total_value_loss is accumulated above but not used in this cell --
    # confirm which one update() should receive.
    agent.update(value_loss, policy_loss)
    elapse = time.time() - start_time
    run_time.append(elapse)

    episode_rewards.append(avg_episode_reward)
    print("Episode " + str(episode) + ": " + str(avg_episode_reward))

dr_trpo_wass_rewards = episode_rewards
dr_trpo_wass_runtime = run_time

Episode 0: -500.0
Episode 1: -500.0
Episode 2: -496.0
Episode 3: -439.6666666666667
Episode 4: -476.3333333333333
Episode 5: -407.0
Episode 6: -500.0
Episode 7: -500.0
Episode 8: -500.0
Episode 9: -500.0
Episode 10: -500.0
Episode 11: -193.66666666666666
Episode 12: -174.33333333333334
Episode 13: -152.66666666666666
Episode 14: -159.33333333333334
Episode 15: -120.33333333333333
Episode 16: -147.33333333333334
Episode 17: -131.33333333333334
Episode 18: -152.66666666666666
Episode 19: -130.0
Episode 20: -134.33333333333334
Episode 21: -145.33333333333334
Episode 22: -136.66666666666666
Episode 23: -147.33333333333334
Episode 24: -131.33333333333334
Episode 25: -137.33333333333334
Episode 26: -126.33333333333333
Episode 27: -141.66666666666666
Episode 28: -135.33333333333334
Episode 29: -126.33333333333333
Episode 30: -132.33333333333334
Episode 31: -161.33333333333334
Episode 32: -156.66666666666666
Episode 33: -138.66666666666666
Episode 34: -136.0
Episode 35: -120.66666666666667
Epi

In [4]:
# Save the DRPO (Wasserstein) learning curve as CSV: column 0 is the elapsed
# time recorded per episode, column 1 is the average episode reward. The
# timestamp in the file name keeps separate runs in separate files.
import os

name = './log_files/dr_trpo_wass/' + wass_env + '-' + str(time.time()) + '.csv'
# Create the log directory on first use so a finished training run is not
# lost to a FileNotFoundError.
os.makedirs(os.path.dirname(name), exist_ok=True)
out = np.column_stack((dr_trpo_wass_runtime, dr_trpo_wass_rewards))
with open(name, 'ab') as f:
    np.savetxt(f, out, delimiter=',')

## DRPO Agent (Sinkhorn)

In [None]:
import gym
from drpo import DRTRPOAgent 
import time
import numpy as np

# Train a DRTRPOAgent on Acrobot with the Sinkhorn policy loss and collect its
# learning curve (average reward and elapsed wall-clock time per episode).

sink_env = "Acrobot-v1"
# Create Gym environment
env = gym.make(sink_env)

# Check agent class for initialization parameters and initialize agent
if sink_env == "Acrobot-v1":
    gamma = 0.95
    lr = 1e-2

agent = DRTRPOAgent(env, gamma, lr)

# Define training parameters
max_episodes = 150
max_steps = 500
total_adv_diff = 0  # running sum of each episode's largest advantage gap

episode_rewards = []
run_time = []
start_time = time.time()
for episode in range(max_episodes):
    # The first episode starts from a fresh reset; later episodes start from
    # whatever `state` the previous episode's last rollout left behind.
    if episode == 0:
        first_state = env.reset()
    else:
        first_state = state
    state_adv = []
    total_value_loss = 0

    episode_reward = 0
    # One rollout per candidate first action: action i is forced at step 0
    # and the agent chooses every action after that.
    for i in range(env.action_space.n):
        # A stray token after this call used to make the whole cell a
        # syntax error.
        # NOTE(review): the env is reset here while `state` is set to
        # first_state, so the two may disagree -- confirm this is intended.
        env.reset()
        state = first_state
        action = i
        trajectory = []

        for step in range(max_steps):
            if step != 0:
                action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            trajectory.append((state, action, reward, next_state, done))
            episode_reward += reward
            # range(max_steps) already caps the rollout length, so only
            # `done` needs checking (step never equals max_steps).
            if done:
                break
            state = next_state

        adv, value_loss = agent.compute_adv_mc(trajectory)
        state_adv.append(adv[0])
        total_value_loss += value_loss

    # Rewards were summed over all rollouts; report the per-rollout mean.
    avg_episode_reward = episode_reward / env.action_space.n
    # add randomness for better exploration
    if (state_adv[0] == state_adv[1]) and (state_adv[1] == state_adv[2]) and avg_episode_reward < -490:
        state_adv[0] += (np.random.random() - 0.5) * 2
        state_adv[1] += (np.random.random() - 0.5) * 2
        state_adv[2] += (np.random.random() - 0.5) * 2

    state_adv[1] += 5

    # restart the agent if stuck
    if (episode >= 5) and (avg_episode_reward <= -490):
        agent = DRTRPOAgent(env, gamma, lr)

    # The advantage gap is still accumulated, but beta is a fixed constant
    # for the Sinkhorn loss in this cell.
    total_adv_diff += max(
        abs(state_adv[1] - state_adv[0]),
        abs(state_adv[2] - state_adv[0]),
        abs(state_adv[2] - state_adv[1]),
    )
    beta = 40
    policy_loss = agent.compute_policy_loss_sinkhorn(first_state, state_adv, beta)

    # NOTE(review): only the last rollout's value_loss is passed on;
    # total_value_loss is accumulated above but not used in this cell --
    # confirm which one update() should receive.
    agent.update(value_loss, policy_loss)
    elapse = time.time() - start_time
    run_time.append(elapse)

    episode_rewards.append(avg_episode_reward)
    print("Episode " + str(episode) + ": " + str(avg_episode_reward))

dr_trpo_sink_rewards = episode_rewards
dr_trpo_sink_runtime = run_time

In [None]:
# Save the DRPO (Sinkhorn) learning curve as CSV: column 0 is the elapsed
# time recorded per episode, column 1 is the average episode reward. The
# timestamp in the file name keeps separate runs in separate files.
import os

name = './log_files/dr_trpo_sink/' + sink_env + '-' + str(time.time()) + '.csv'
# Create the log directory on first use so a finished training run is not
# lost to a FileNotFoundError.
os.makedirs(os.path.dirname(name), exist_ok=True)
out = np.column_stack((dr_trpo_sink_runtime, dr_trpo_sink_rewards))
with open(name, 'ab') as f:
    np.savetxt(f, out, delimiter=',')