In [None]:
import numpy as np

# from evaluation.mc import *
# from utils.misc import *
# from policies import *
import gym
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from collections import defaultdict
from tqdm import tqdm

class EpsilonGreedyPolicy(object):
    """Epsilon-greedy policy over a tabular action-value function.

    With probability ``1 - epsilon`` a greedy action (highest Q-value in the
    current state) is taken; with probability ``epsilon`` an action is drawn
    uniformly from all actions. Ties among greedy actions are split evenly.

    Args:
        actions: Sequence of available actions. The i-th action corresponds
            to the i-th entry of ``Q[state]``.
        Q: Action values, indexable as ``Q[state]`` and yielding one value
            per action.
        epsilon: Exploration probability in [0, 1].
    """

    def __init__(self, actions, Q, epsilon):
        self.actions = actions
        self.Q = Q
        self.epsilon = epsilon

    def _action_probs(self, state):
        """Return the full distribution pi(. | state) as a 1-D array."""
        q_values = np.asarray(self.Q[state])
        greedy = np.flatnonzero(q_values == q_values.max())
        num_actions = len(self.actions)
        probs = np.full(num_actions, self.epsilon / num_actions)
        # The greedy mass is shared equally between tied actions so that the
        # reported probabilities are deterministic and sum to one.
        probs[greedy] += (1.0 - self.epsilon) / len(greedy)
        return probs

    def get_probs(self, states, actions):
        """Return pi(a | s) for every (state, action) pair.

        Args:
            states: Sequence of states.
            actions: Sequence of actions, same length as ``states``.

        Returns:
            1-D numpy array whose i-th entry is the probability of taking
            ``actions[i]`` in ``states[i]`` under this policy.
        """
        # Map an action value to its column in Q[state].
        column_of = {action: i for i, action in enumerate(self.actions)}
        return np.array([
            self._action_probs(state)[column_of[action]]
            for state, action in zip(states, actions)
        ])

    def sample_action(self, state):
        """Sample one action for ``state`` according to the policy."""
        return np.random.choice(self.actions, p=self._action_probs(state))

class RandomPolicy(object):
    """Policy that picks every available action with equal probability."""

    def __init__(self, actions):
        self.actions = actions

    def get_probs(self, states, actions):
        """Return the uniform action probability, once per entry of ``states``.

        The ``actions`` argument is accepted for interface compatibility but
        does not influence the result, since all actions are equally likely.
        """
        uniform = 1. / len(self.actions)
        return np.full(len(states), uniform)

    def sample_action(self, state):
        """Draw one action uniformly at random from ``self.actions``."""
        repeated_state = [state] * len(self.actions)
        weights = self.get_probs(repeated_state, self.actions)
        return np.random.choice(self.actions, p=weights)





def mc_weighted_importance_sampling(env, behavior_policy, target_policy, num_episodes, sampling_function,
                                    discount_factor=1.0):
    """
    Monte Carlo prediction algorithm. Calculates the value function
    for a given target policy using behavior policy and weighted importance sampling.

    Every visit to a state contributes its return G, weighted by the
    importance ratio W of the remainder of the episode (from that step on),
    and V[state] is kept as the running W-weighted average of those returns.

    Args:
        env: OpenAI gym environment.
        behavior_policy: A policy used to collect the data. Must provide
            get_probs(states, actions) returning one probability per pair.
        target_policy: A policy which value function we want to estimate.
            Must provide the same get_probs(states, actions) interface.
        num_episodes: Number of episodes to sample.
        sampling_function: Function that generates data from one episode.
            Called as sampling_function(env, behavior_policy) and expected to
            return (states, actions, rewards, dones).
        discount_factor: Gamma discount factor.

    Returns:
        A dictionary that maps from state -> value.
        The state is a tuple and the value is a float.
    """

    # Current value estimate per state.
    V = defaultdict(float)

    # Cumulative sum of importance weights per state; the denominator of the
    # weighted average. Keyed by state, like V, because V is a state-value function.
    C = defaultdict(float)

    for _ in tqdm(range(num_episodes)):
        states, actions, rewards, _dones = sampling_function(env, behavior_policy)

        G = 0.0
        W = 1.0

        # Walk the episode backwards so G and W can be built incrementally.
        for state, action, reward in zip(reversed(states), reversed(actions), reversed(rewards)):
            G = discount_factor * G + reward

            # The return from this state depends on the action taken here, so
            # the ratio for this step is folded into W before the update.
            target_prob = target_policy.get_probs([state], [action])[0]
            behavior_prob = behavior_policy.get_probs([state], [action])[0]
            W = W * target_prob / behavior_prob

            # A zero weight stays zero for all earlier steps of the episode,
            # and would otherwise risk a 0/0 division on a first visit.
            if W == 0:
                break

            C[state] += W
            V[state] += (W / C[state]) * (G - V[state])

    return V
def sample_episode(env, policy):
    """
    Roll out a single episode of `env` while acting according to `policy`.

    Args:
        env: OpenAI gym environment; `reset()` yields the initial state and
            `step(action)` yields a (state, reward, done, info) tuple.
        policy: Object exposing `sample_action(state)`.

    Returns:
        Tuple of lists (states, actions, rewards, dones), all of equal length.
        The state reached after the terminating step is not recorded.
    """
    states, actions, rewards, dones = [], [], [], []

    current = env.reset()
    finished = False

    while not finished:
        # Record the state the decision is made in, not the one it leads to.
        states.append(current)

        chosen = policy.sample_action(current)
        current, reward, done, _ = env.step(chosen)

        actions.append(chosen)
        rewards.append(reward)
        dones.append(done)

        # Equality test (rather than truthiness) kept on purpose so the
        # termination condition is exactly `done == True`.
        finished = done == True

    return states, actions, rewards, dones
###


In [None]:
# Off-policy evaluation experiment on an n-state chain environment.
n = 7

# Target policy's action values: action 0 is strictly preferred in every state.
Q = np.zeros((n, 2))
Q[:, 0] = 1

from gym.envs.registration import register
# NChainEnv itself never reports done=True, so an explicit step cap is needed;
# without it sample_episode would loop forever. 1000 mirrors the cap gym uses
# for its own NChain-v0 registration.
max_episode_steps = 1000
register(id='NChainN-v0', entry_point='gym.envs.toy_text:NChainEnv', kwargs={'n': n},
         max_episode_steps=max_episode_steps)
# The chain length is already passed to the environment through `kwargs`.
env = gym.make('NChainN-v0')
print(env.action_space)
print(Q)

# Let's sample some episodes
actions = [0,1]
target_policy = EpsilonGreedyPolicy(actions, Q, 0.1)
behavior_policy = RandomPolicy(actions)
print(1)
# First we run the env with the random (behavior) agent
for episode in range(3):
    trajectory_data = sample_episode(env, behavior_policy)
    print("Episode {}:\nStates {}\nActions {}\nRewards {}\nDones {}\n".format(episode,*trajectory_data))
print(2)
# Seed numpy's RNG so the action sampling during evaluation is repeatable.
np.random.seed(42)
V_10k = mc_weighted_importance_sampling(env,behavior_policy , target_policy, 10000, sample_episode)
# V_500k = mc_weighted_importance_sampling(env, behavior_policy, target_policy, 500000, sample_episode)


# Vs = [V_10k, V_500k]
# x = np.arange(n)
# fig = plt.figure(figsize=(20, 10))

# for i in Vs:
#     number_episodes = 1e4 if i == V_10k else 1e5
#     plt.scatter(x,[i[j] for j in x], label= str(number_episodes))
# plt.xlabel('States')
# plt.ylabel('V')
# plt.show()



print(V_10k)


Discrete(2)
[[1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]]
1
Discrete(2)
[[1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]]
1
