In [2]:
import gym

In [80]:
import gym_tetris
from gym_tetris.actions import MOVEMENT
from nes_py.wrappers import JoypadSpace
from scipy.stats import norm
from scipy.stats import multivariate_normal
import numpy as np
from collections import namedtuple
from IPython.display import clear_output

In [244]:
class RadialBasisFunction(object):
    
    def __init__(self, means, scales):
        self.means = means
        self.scales = scales
        self.norms = [multivariate_normal(mean, scale) for mean, scale in zip(means, scales)]
    
    def __call__(self, x):
        return np.array([1] + [rv.pdf(x) for rv in self.norms])
    
    def __len__(self):
        return len(self.norms) + 1

class Identity(object):
    
    def __init__(self, n):
        self.n = n
    
    def __call__(self, observation):
        return np.array([1] + list(observation))
    
    def __len__(self):
        return self.n + 1
    
class Policy(object):
    
    def __init__(self, feature_function, action_space, weights=None, eta=0.1, alpha=0.1):
        self.feature_function = feature_function
        self.action_space = action_space
        self.eta = eta
        self.alpha = alpha
        
        if weights is None:
            weights = []
            for a in range(action_space.n):
                weights.append(np.random.normal(loc=0.0, scale=0.2, size=len(feature_function)))
                self.weights = np.array(weights)
        else:
            self.weights = weights

    def evaluate(self, observation):
        f = self.feature_function(observation)
        values = self.weights.dot(f)
        return values
            
    def select_action(self, observation, eta_greedy=True):
        r = np.random.uniform()
        if r < self.eta and eta_greedy is True:
            return self.action_space.sample()
        else:
            values = self.evaluate(observation)
            action_id = np.argmax(values)
            return action_id
        
    def update(self, delta, observation, action):
        self.weights[action] = self.weights[action] + self.alpha * delta * self.feature_function(observation)
        
        
State = namedtuple("State", ["observation", "reward", "action"])

In [263]:


def Sarsa(policy, env_name="MountainCar-v0", alpha=0.1, gamma=1, max_steps=1000):

    d = 0.9

    env = gym.make(env_name)
    observation = env.reset()

    cumulative_reward = 0

    action = policy.select_action(observation)

    trace = []

    max_loc = observation[0]
    
    for t in range(max_steps):
#         env.render()

        # take action A observe R (reward) and S' (observation)
        new_observation, reward, done, info = env.step(action)
        
        if env_name == "MountainCar-v0":
            done = new_observation[0] > 0.5
    
        max_loc = max(max_loc, new_observation[0])
        
        # choose A' from S' using policy derived from Q
        new_action = policy.select_action(observation)

        #delta <- R + Q(s' a') - Q(s, a)
        delta = reward - gamma * policy.evaluate(new_observation)[new_action] - policy.evaluate(observation)[action]

        policy.update(delta, observation, action)

        cumulative_reward += reward
        
        action = new_action
        observation = new_observation
        
        if done:
            break
        
    print(f"Episode finished after {t+1} steps with reward {cumulative_reward}")
    print(f"Max location reached: {max_loc}")
    env.close()
    return cumulative_reward
        


env_name = "CartPole-v0"
env = gym.make("CartPole-v0")
observation = env.reset()
observation_dims = len(observation)
env.close()

means = [np.random.normal(size=observation_dims) for i in range(10)]
scales = [np.random.uniform(size=(observation_dims, observation_dims)) for i in range(10)]

    
# means = [[0, 0], [1, 1], [-1, -1], [-1, 1], [1, -1]]
# scales = [[[1, 0], [0,1]]]*5
rbf = RadialBasisFunction(means, scales)

# identity = Identity(n=4)


policy = Policy(rbf, env.action_space, alpha=alpha, eta=.1)

for episode in range(10000):
    clear_output(wait=True)
    print("episode", episode+1)
    reward = Sarsa(policy, env_name='CartPole-v0')
    

ValueError: the input matrix must be positive semidefinite

In [254]:
def do_episode(env_name, policy, max_steps=1000):
    env = gym.make("CartPole-v0")
    observation = env.reset()
    cumulative_reward = 0
    for i in range(max_steps):
        env.render()
        
        action = policy.select_action(observation)
        observation, reward, done, info = env.step(action)
        
        cumulative_reward += reward
        
        if done:
            break
    print("Reward:", cumulative_reward)
    env.close()

In [262]:
do_episode("CartPole-v0", policy)


Reward: 40.0


In [239]:
env = gym.make("CartPole-v0")
observation = env.reset()
observation

array([-0.01589927, -0.00277189,  0.00858706,  0.00064014])

In [72]:

gamma = 1
d = 0.9
alpha = 0.1

env = gym.make("MountainCar-v0")
observation = env.reset()



means = [[0, 0], [1, 1], [-1, -1], [-1, 1], [1, -1]]
scales = [[[1, 0], [0,1]]]*5
rbf = RadialBasisFunction(means, scales)

e = np.zeros(len(means))

policy = Policy(rbf, env.action_space, alpha=alpha)


cumulative_reward = 0



action = policy.select_action(observation)

trace = []

print(policy.weights)

for t in range(1000):
    env.render()
    
    # take action A observe R (reward) and S' (observation)
    new_observation, reward, done, info = env.step(action)
    
    if new_observation[]
    
    # choose A' from S' using policy derived from Q
    new_action = policy.select_action(observation)

    #delta <- R + Q(s' a') - Q(s, a)
    delta = reward - gamma * policy.evaluate(new_observation)[new_action] - policy.evaluate(observation)[action]
    
    policy.update(delta, observation, action)
    
    cumulative_reward += reward
    if done:
        print(done)
        break
print(f"Episode finished after {t+1} steps with reward {cumulative_reward}")
env.close()

print(policy.weights)

TypeError: __init__() got an unexpected keyword argument 'max_episode_steps'

In [84]:
env = gym.make("MountainCar-v0")
observation = env.reset()

In [85]:
observation

array([-0.46302643,  0.        ])

In [97]:
env = gym.make("MountainCar-v0")
observation = env.reset()
print(observation)
env.render()
new_observation, reward, done, info = env.step(2)
print(new_observation)



[-0.58936787  0.        ]
[-0.58787779  0.00149007]


In [None]:
new_observation, reward, done, info = env.step(0)
print(new_observation)
env.render()

In [174]:
new_observation, reward, done, info = env.step(1)
print(new_observation)
env.render()

[-8.84988196e-01 -5.68178691e-04]


True

In [229]:
new_observation, reward, done, info = env.step(2)
print(new_observation)
env.render()

[0.51100641 0.01819706]


True

In [231]:
reward

-1.0

In [74]:
env.close()


Help on package gym:

NAME
    gym

PACKAGE CONTENTS
    core
    envs (package)
    error
    logger
    spaces (package)
    utils (package)
    vector (package)
    version
    wrappers (package)

CLASSES
    builtins.object
        gym.core.Env
            gym.core.Wrapper
        gym.spaces.space.Space
    
    class Env(builtins.object)
     |  The main OpenAI Gym class. It encapsulates an environment with
     |  arbitrary behind-the-scenes dynamics. An environment can be
     |  partially or fully observed.
     |  
     |  The main API methods that users of this class need to know are:
     |  
     |      step
     |      reset
     |      render
     |      close
     |      seed
     |  
     |  And set the following attributes:
     |  
     |      action_space: The Space object corresponding to valid actions
     |      observation_space: The Space object corresponding to valid observations
     |      reward_range: A tuple corresponding to the min and max possible reward

In [None]:
gym.make()

In [66]:
env.close()

In [13]:
# env = gym_tetris.make('TetrisA-v0')
# env = JoypadSpace(env, MOVEMENT)
# done = True
# for step in range(5000):
#     if done:
#         state = env.reset()
#     state, reward, done, info = env.step(env.action_space.sample())
#     env.render()

# env.close()

In [15]:
state

array([[[  0, 252, 252],
        [  0, 252, 252],
        [  0, 252, 252],
        ...,
        [  0, 252, 252],
        [  0, 252, 252],
        [  0,   0,   0]],

       [[124, 124, 124],
        [124, 124, 124],
        [124, 124, 124],
        ...,
        [124, 124, 124],
        [124, 124, 124],
        [  0,   0,   0]],

       [[124, 124, 124],
        [124, 124, 124],
        [124, 124, 124],
        ...,
        [124, 124, 124],
        [124, 124, 124],
        [  0,   0,   0]],

       ...,

       [[  0, 252, 252],
        [124, 124, 124],
        [124, 124, 124],
        ...,
        [124, 124, 124],
        [124, 124, 124],
        [  0,   0,   0]],

       [[  0, 252, 252],
        [124, 124, 124],
        [124, 124, 124],
        ...,
        [124, 124, 124],
        [124, 124, 124],
        [  0,   0,   0]],

       [[  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0],
        ...,
        [124, 124, 124],
        [124, 124, 124],
        [  0,   0,   0]]

In [16]:
reward

0