In [1]:
import gym
import numpy as np
from Modules import Q_Learner, Inverse_Learner

In [2]:
# Define the abstract method features()

class My_Q_Learner(Q_Learner):
    """Q-learner whose state-action features are built from ``state[2]``.

    Only ``features`` is defined here; everything else comes from the
    ``Q_Learner`` base class imported from ``Modules``.
    """

    def features(self, state, action):
        """Return the 3-element feature vector for a (state, action) pair.

        The vector is ``[state[2], |state[2] + s|, 1]`` where ``s`` maps the
        action 0 to -1 and the action 1 to +1, and the trailing 1 is a bias
        term.
        """
        # presumably state[2] is the pole angle of CartPole -- TODO confirm
        third_component = state[2]
        # 0 -> -1, 1 -> +1
        signed_action = 2*action - 1
        shifted_magnitude = abs(third_component + signed_action)
        return np.array([third_component, shifted_magnitude, 1.0], dtype=float)

In [3]:
# Build the CartPole environment used by the rest of the notebook
env = gym.make('CartPole-v0')

# Hyper-parameters (also reused later when the imitator is created)
actions_arr = np.arange(env.action_space.n)
d = 3  # My_Q_Learner.features returns 3 values
learning_rate = 0.01
epsilon = 0.9
discount_factor = 0.95

# Create the Q-learning agent
agent = My_Q_Learner(
    actions_arr, d, learning_rate, epsilon, discount_factor
)

# Train for num_episodes matches; `rounds` counts the steps survived in the
# episode currently being played and is reset at the start of each one.
num_episodes = 500
for episode in range(num_episodes):
    state = env.reset()
    rounds = 0
    done = False
    while not done:
        previous_state = state
        action = agent.best_action(previous_state, training=True)
        state, reward, done, info = env.step(action)
        agent.update_parameters(previous_state, state, action, reward)
        rounds += 1

# Printed once, after training: `rounds` is the length of the last episode only
print("\n--> Game Over. Rounds: {}".format(rounds))
print("Parameter vector:\n{}".format(agent.theta))

  result = entry_point.load(False)



--> Game Over. Rounds: 53
Parameter vector:
[0.33598688 0.67030228 0.66167036]


In [4]:
# Estimate how long the trained agent survives: play num_episodes matches
# with training=False and accumulate the total step count in `rounds`.
num_episodes = 100
rounds = 0
for episode in range(num_episodes):
    state, done = env.reset(), False
    while not done:
        state, _, done, _ = env.step(agent.best_action(state, training=False))
        rounds += 1
mean_rounds = rounds/num_episodes
print("On average, the agent survives {} rounds.\n".format(mean_rounds))

On average, the agent survives 41.64 rounds.



In [5]:
# Define the abstract methods

class My_Inverse_Learner(Inverse_Learner):
    """Concrete ``Inverse_Learner`` for CartPole.

    Supplies the three hooks defined below (``features``, ``environment``
    and ``imitator``); how the base class uses them is defined in ``Modules``.
    """

    def features(self, old_state, action, state):
        """Return the reward features of a transition.

        The vector is the element-wise change in absolute value between
        ``old_state`` and ``state``, followed by a constant bias of 1.
        ``action`` is accepted but not used.
        """
        feat1 = np.abs(state) - np.abs(old_state)
        bias = np.ones(1)
        return np.concatenate((feat1, bias))

    def environment(self):
        """Create and return a fresh CartPole-v0 environment."""
        return gym.make('CartPole-v0')

    def imitator(self):
        """Create and return a new, untrained ``My_Q_Learner``.

        The number of actions is read from an environment obtained through
        ``self.environment()`` rather than from a notebook-level ``env``
        variable, so the method does not depend on which cells were run
        before it.
        """
        # Temporary environment used only to query the action count
        probe_env = self.environment()
        try:
            actions_arr = np.arange(probe_env.action_space.n)
        finally:
            probe_env.close()

        # Parameters
        d = 3 # <--- set the right number of features
        learning_rate = 0.01
        epsilon = 0.9
        discount_factor = 0.95

        imitator = My_Q_Learner(actions_arr, d, learning_rate,
                                epsilon, discount_factor)
        return imitator

In [6]:
def produce_trajectory(agent, env):
    """Play one episode with ``agent`` in ``env`` and record it.

    Actions are chosen with ``agent.best_action(state, training=False)``.

    Returns:
        A flat list ``[s0, a0, s1, a1, ..., sN]`` that starts with the state
        returned by ``env.reset()`` and then alternates each action with the
        state returned by ``env.step`` for it, ending when ``env.step``
        reports the episode as done.
    """
    current = env.reset()
    steps = [current]
    finished = False
    while not finished:
        chosen = agent.best_action(current, training=False)
        current, _reward, finished, _info = env.step(chosen)
        steps.extend((chosen, current))
    return steps

In [7]:
# The agent trained in the cells above plays the role of the expert:
# record num_trajectories of its episodes.
num_trajectories = 10
expert_trajectories = [
    produce_trajectory(agent, env) for _ in range(num_trajectories)
]

In [8]:
# Initialize the reward-learning environment

# My_Inverse_Learner.features returns len(state) values plus one bias term;
# 5 matches that assuming a 4-element CartPole state -- TODO confirm.
features_dim = 5 # <--- set the number of features of the reward function
# Rebinds the notebook-level discount_factor (same value as before); the
# imitator created in a later cell reads this variable too.
discount_factor = 0.95

# Build the inverse learner from the expert trajectories recorded above
irl = My_Inverse_Learner(features_dim, expert_trajectories, discount_factor)
irl.initialize_parameters(num_matches=10)

In [9]:
# Fit the reward parameters: 100 successive calls to irl.update_parameters,
# each one with num_matches=10
for update_step in range(100):
    irl.update_parameters(num_matches=10)

In [10]:
# Imitation: Train a new agent w.r.t. the reward we just learned
# The constructor arguments are notebook-level variables set in earlier
# cells (actions_arr, d, learning_rate, epsilon, discount_factor).
imitator = My_Q_Learner(actions_arr, d, learning_rate, 
                     epsilon, discount_factor)

# Keep whatever irl.train_agent returns as the imitator
imitator = irl.train_agent(imitator)

In [11]:
# Show the expert's and the imitator's parameter vectors (theta) one above the other
print("Expert parameters:\t{}\nImitator parameters:\t{}".format(agent.theta, imitator.theta))

Expert parameters:	[0.33598688 0.67030228 0.66167036]
Imitator parameters:	[-0.07825517  0.73000616  0.6789456 ]


In [12]:
# Estimate how long the imitator survives: play num_episodes matches with
# training=False and accumulate the total step count in `rounds`.
num_episodes = 100
rounds = 0
for episode in range(num_episodes):
    state, done = env.reset(), False
    while not done:
        state, _, done, _ = env.step(imitator.best_action(state, training=False))
        rounds += 1
mean_rounds = rounds/num_episodes
print("On average, the imitator survives {} rounds.\n".format(mean_rounds))

On average, the imitator survives 41.33 rounds.

