In [97]:
import gym
import numpy as np
import torch
import torch.nn as nn

In [98]:
env = gym.make("FrozenLake-v0")

In [99]:
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [100]:
# Get the True Rewards

#TODO
true_rewards = np.zeros(16)
true_rewards[15] = 1


In [101]:
def q_from_v(env, V, s, gamma=1):
    
    q = np.zeros(env.nA)
    for a in range(env.nA):
        for prob, next_state, reward, done in env.P[s][a]:
            q[a] += prob * (reward + gamma * V[next_state])
    return q

In [102]:
def policy_improvement(env, V, gamma=1):
    policy = np.zeros([env.nS, env.nA]) / env.nA
    for s in range(env.nS):
        q = q_from_v(env, V, s, gamma)
        
        # OPTION 1: construct a deterministic policy 
        # policy[s][np.argmax(q)] = 1
        
        # OPTION 2: construct a stochastic policy that puts equal probability on maximizing actions
        best_a = np.argwhere(q==np.max(q)).flatten()
        policy[s] = np.sum([np.eye(env.nA)[i] for i in best_a], axis=0)/len(best_a)
        
    return policy


In [103]:
def value_iteration(env, gamma=1, theta=1e-8):
    V = np.zeros(env.nS)
    while True:
        delta = 0
        for s in range(env.nS):
            v = V[s]
            V[s] = max(q_from_v(env, V, s, gamma))
            delta = max(delta,abs(V[s]-v))
        if delta < theta:
            break
    policy = policy_improvement(env, V, gamma)
    return policy, V

In [104]:
policy_vi, V_vi = value_iteration(env.env)

# print the optimal policy
print("\nOptimal Policy (LEFT = 0, DOWN = 1, RIGHT = 2, UP = 3):")
print(policy_vi,"\n")


Optimal Policy (LEFT = 0, DOWN = 1, RIGHT = 2, UP = 3):
[[1.   0.   0.   0.  ]
 [0.   0.   0.   1.  ]
 [0.   0.   0.   1.  ]
 [0.   0.   0.   1.  ]
 [1.   0.   0.   0.  ]
 [0.25 0.25 0.25 0.25]
 [0.5  0.   0.5  0.  ]
 [0.25 0.25 0.25 0.25]
 [0.   0.   0.   1.  ]
 [0.   1.   0.   0.  ]
 [1.   0.   0.   0.  ]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.   0.   1.   0.  ]
 [0.   1.   0.   0.  ]
 [0.25 0.25 0.25 0.25]] 



In [105]:
class MyRewardWrapper(gym.Wrapper):
    
    def __init__(self, env, rewards):
        super(MyRewardWrapper, self).__init__(env)
        self.rewards = rewards
        
    def step(self, action):
        next_state, true_reward, terminated, info = self.env.step(action)
        reward = self.rewards[next_state]
        return next_state, reward, terminated, info

In [94]:
# r = np.random.randn(16)
# env = gym.make('FrozenLake-v0')
# env = MyRewardWrapper(env, rewards = r)

In [106]:
# Generate Expert Trajectories Using Optimal Policy
num_trajs = 1000
expert_trajs = []
for i in range(num_trajs):
    traj = []
    total_reward = 0
    state = env.reset()
    is_done = False
    while True:
        action = np.argmax(policy_vi[state])
        traj.append((state, action))
        next_state, reward, is_done, _ = env.step(action)
        total_reward += reward
        state = next_state
        if is_done:
            break
        
    expert_trajs.append(traj)
        
        
    

In [107]:
original_env = gym.make('FrozenLake-v0')
# original_env.env.P


# adding +1 to account for absorbing state
        # (reached whenever game ended)
n_states = original_env.observation_space.n + 1
n_actions = original_env.action_space.n

transitions = np.zeros([n_states, n_actions, n_states])

# iterate over all "from" states:
for state, transitions_given_state in original_env.env.P.items():
    # iterate over all actions:
    for action, outcomes in transitions_given_state.items():
        # iterate over all possible outcomes:
        for probability, next_state, _, done in outcomes:
            # add transition probability T(s, a, s')
            transitions[state, action, next_state] += probability
            if done:
                # outcome was marked as ending the game.
                # if game is done and state == next_state, map to absorbing state instead
                if state == next_state:
                    transitions[state, action, next_state] = 0
                # map next state to absorbing state
                # make sure that next state wasn't mapped to any other state yet
                assert np.sum(transitions[next_state, :, :-1]) == 0
                transitions[next_state, :, -1] = 1.0

# specify transition probabilities for absorbing state:
        # returning to itself for all actions.
transitions[-1, :, -1] = 1.0

In [129]:
def compute_svf(trajs, transition_matrix, policy):
    
#     longest_traj_len = np.max([len(traj) for traj in trajs()])
    longest_traj_len = 100
    nS = 16 # TODO: remove hardcoding
    nA = 4 
            
        
    # svf[state, time] is the frequency of visiting a state at some point of time
    svf = np.zeros((nS, longest_traj_len))

    for traj in trajs:
        for state in traj[:,0]:
#             print(state)
            svf[int(state), 0] += 1
    svf[:, 0] = svf[:, 0] / len(trajs)

    for time in range(1, longest_traj_len):
        for state in range(nS):
            total = 0
            for previous_state in range(nS):
                for action in range(nA):
                    total += svf[
                        previous_state, time - 1] * transition_matrix[
                            previous_state, action, state] * policy[
                                previous_state, action]
            svf[state, time] = total
    # sum over all time steps and return SVF for each state:
    return np.sum(svf, axis=1)   

# compute_svf(trajs, transitions, policy = np.random.randn(16, 4))


In [130]:
# reshape expert trajectories for convenience

trajs = np.zeros((1000, 100, 2))

for i, traj in enumerate(expert_trajs):
    tmp = np.zeros((100, 2))
    tmp[:np.array(traj).shape[0]] = np.array(traj)
    trajs[i] = tmp
    

In [139]:
# ME_IRL Begins

## initialize reward fucntion

# reward is a Fully Connected NN with two layers

# reward_model = nn.Sequential(nn.Linear(100, 32),
#                       nn.ReLU(),
#                       nn.Linear(32, 16))


num_iters = 100

theta = np.random.randn(16)
print(theta)
# rewards = torch.matmul(theta, torch.Tensor(np.eye(16)))

def rf(state):
    return theta[int(state)]

def rt(traj):
    return np.sum([rf(state) for state in traj[:, 0]])

def grad(trajs, theta, svf):
    
    first_term = []
    for traj in trajs:
        drtaudpsi = np.zeros(16)
        for i, state in enumerate(traj[:, 0]):
            drtaudpsi[int(state)] = rf(state)
        first_term.append(drtaudpsi)
    first_term = np.sum(first_term, axis = 0)
    
#     svf = compute_svf(trajs, transitions, policy)
    for state in range(16):
        second_term = np.zeros(16)
        
        thing = svf[state]*rf(state)
        second_term[state] = thing
    
    gr = first_term - 1000*second_term
    
    return gr


# the training loop

lr = 0.001
        
    
for i in range(num_iters):
    # wrap env with current reward estimate
    env_now = MyRewardWrapper(env, rewards)
    # solve using value iteration
    policy, _ = value_iteration(env_now.env.env)
    
    # get state-visitation-frquency
    
    svf = compute_svf(trajs,transitions, policy)
    
    gr = grad(trajs, theta, svf)
    
   
    theta -= gr*lr
#     print(theta)
    
    
print(theta)
    
    



[ 0.49370283  0.83372422 -0.47692964 -0.31729862  1.15796439  0.60233496
 -0.60407374  1.7635248   0.43640086  0.60269814  0.31637755 -2.35735379
  1.16237767  1.71425276  0.43251223  0.30506572]
[ 0.00000000e+000  2.96311838e-007 -1.19610884e-007 -1.82892919e-005
  0.00000000e+000  6.02334959e-001 -1.83298045e-020  1.76352480e+000
  0.00000000e+000  3.10617073e-253  9.94352710e-060 -2.35735379e+000
  1.16237767e+000  1.81500025e-065  2.42321984e-079  2.24850692e+187]


In [None]:
r = np.random.randn(16)
r = torch.Tensor(r)
reward(r)

In [38]:
r = reward_model(torch.Tensor(trajs[2][:,0]))
a = torch.sum(r)
a.backward()


grads = []
params = reward_model.parameters()
for i, param in enumerate(params):
    grads.append(param.grad)
    

for i, param in enumerate(params):
    param -= grads[i]

    

In [53]:
reward(r)

tensor([ 0.3010, -0.0735,  0.0607,  0.2353, -0.0014,  0.3910,  0.0201,  0.0372,
         0.0367, -0.0868, -0.0566,  0.2312, -0.1701,  0.0402, -0.1171,  0.0302],
       grad_fn=<AddBackward0>)

In [None]:
def reward_from_traj(

)

In [55]:
reward.grad()

AttributeError: 'Sequential' object has no attribute 'grad'

In [56]:
from gym.core import RewardWrapper


In [57]:
env = gym.make('FrozenLake-v0')

In [58]:
env = RewardWrapper(env)

In [59]:
env


<RewardWrapper<TimeLimit<FrozenLakeEnv<FrozenLake-v0>>>>

In [61]:
env.reward(r)



AttributeError: 'RewardWrapper' object has no attribute '_reward'